From 8555255f0b426858d8648c6206b70eb906cf4ec7 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:14:01 +0100
Subject: Add generic Kbuild files for 'make headers_install'

This adds the Kbuild files listing the files which are to be installed
by the 'headers_install' make target, in generic directories.

Signed-off-by: David Woodhouse
---
 include/Kbuild                        |  2 ++
 include/asm-generic/Kbuild            |  3 ++
 include/asm-generic/Kbuild.asm        | 11 ++++++
 include/linux/Kbuild                  | 63 +++++++++++++++++++++++++++++++++++
 include/linux/byteorder/Kbuild        |  2 ++
 include/linux/dvb/Kbuild              |  2 ++
 include/linux/hdlc/Kbuild             |  1 +
 include/linux/isdn/Kbuild             |  1 +
 include/linux/netfilter/Kbuild        | 11 ++++++
 include/linux/netfilter_arp/Kbuild    |  2 ++
 include/linux/netfilter_bridge/Kbuild |  4 +++
 include/linux/netfilter_ipv4/Kbuild   | 21 ++++++++++++
 include/linux/netfilter_ipv6/Kbuild   |  6 ++++
 include/linux/nfsd/Kbuild             |  2 ++
 include/linux/raid/Kbuild             |  1 +
 include/linux/sunrpc/Kbuild           |  1 +
 include/linux/tc_act/Kbuild           |  1 +
 include/linux/tc_ematch/Kbuild        |  1 +
 include/mtd/Kbuild                    |  2 ++
 include/rdma/Kbuild                   |  1 +
 include/scsi/Kbuild                   |  2 ++
 include/sound/Kbuild                  |  2 ++
 include/video/Kbuild                  |  1 +
 23 files changed, 143 insertions(+)
 create mode 100644 include/Kbuild
 create mode 100644 include/asm-generic/Kbuild
 create mode 100644 include/asm-generic/Kbuild.asm
 create mode 100644 include/linux/Kbuild
 create mode 100644 include/linux/byteorder/Kbuild
 create mode 100644 include/linux/dvb/Kbuild
 create mode 100644 include/linux/hdlc/Kbuild
 create mode 100644 include/linux/isdn/Kbuild
 create mode 100644 include/linux/netfilter/Kbuild
 create mode 100644 include/linux/netfilter_arp/Kbuild
 create mode 100644 include/linux/netfilter_bridge/Kbuild
 create mode 100644 include/linux/netfilter_ipv4/Kbuild
 create mode 100644 include/linux/netfilter_ipv6/Kbuild
 create mode 100644 include/linux/nfsd/Kbuild
 create mode 100644 include/linux/raid/Kbuild
 create mode 100644 include/linux/sunrpc/Kbuild
 create mode 100644 include/linux/tc_act/Kbuild
 create mode 100644 include/linux/tc_ematch/Kbuild
 create mode 100644 include/mtd/Kbuild
 create mode 100644 include/rdma/Kbuild
 create mode 100644 include/scsi/Kbuild
 create mode 100644 include/sound/Kbuild
 create mode 100644 include/video/Kbuild
(limited to 'include')

diff --git a/include/Kbuild b/include/Kbuild
new file mode 100644
index 000000000000..cb2534800b19
--- /dev/null
+++ b/include/Kbuild
@@ -0,0 +1,2 @@
+header-y += asm-generic/ linux/ scsi/ sound/ mtd/ rdma/ video/
+header-y += asm-$(ARCH)/

diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
new file mode 100644
index 000000000000..70594b275a6e
--- /dev/null
+++ b/include/asm-generic/Kbuild
@@ -0,0 +1,3 @@
+header-y += atomic.h errno-base.h errno.h fcntl.h ioctl.h ipc.h mman.h \
+	signal.h statfs.h
+unifdef-y := resource.h siginfo.h

diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm
new file mode 100644
index 000000000000..d8d0bcecd23f
--- /dev/null
+++ b/include/asm-generic/Kbuild.asm
@@ -0,0 +1,11 @@
+unifdef-y += a.out.h auxvec.h byteorder.h errno.h fcntl.h ioctl.h \
+	ioctls.h ipcbuf.h irq.h mman.h msgbuf.h param.h poll.h \
+	posix_types.h ptrace.h resource.h sembuf.h shmbuf.h shmparam.h \
+	sigcontext.h siginfo.h signal.h socket.h sockios.h stat.h \
+	statfs.h termbits.h termios.h timex.h types.h unistd.h user.h
+
+# These really shouldn't be exported
+unifdef-y += atomic.h io.h
+
+# These probably shouldn't be exported
+unifdef-y += elf.h page.h

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
new file mode 100644
index 000000000000..5c95aa4dc0b4
--- /dev/null
+++ b/include/linux/Kbuild
@@ -0,0 +1,63 @@
+header-y := byteorder/ dvb/ hdlc/ isdn/ nfsd/ raid/ sunrpc/ tc_act/ \
+	netfilter/ netfilter_arp/ netfilter_bridge/ netfilter_ipv4/ \
+	netfilter_ipv6/
+
+header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
+	atmapi.h atmbr2684.h atmclip.h atm_eni.h atm_he.h \
+	atm_idt77105.h atmioc.h atmlec.h atmmpc.h atm_nicstar.h \
+	atmppp.h atmsap.h atmsvc.h atm_zatm.h auto_fs4.h auxvec.h \
+	awe_voice.h ax25.h b1lli.h baycom.h bfs_fs.h blkpg.h \
+	bpqether.h cdk.h chio.h coda_psdev.h coff.h comstats.h \
+	consolemap.h cycx_cfm.h devfs_fs.h dm-ioctl.h dn.h dqblk_v1.h \
+	dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \
+	fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \
+	fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \
+	hpfs_fs.h hysdn_if.h i2c-dev.h i2c-id.h i8k.h icmp.h \
+	if_arcnet.h if_arp.h if_bonding.h if_cablemodem.h if_fc.h \
+	if_fddi.h if.h if_hippi.h if_infiniband.h if_packet.h \
+	if_plip.h if_ppp.h if_slip.h if_strip.h if_tunnel.h in6.h \
+	in_route.h ioctl.h ip.h ipmi_msgdefs.h ip_mp_alg.h ipsec.h \
+	ipx.h irda.h isdn_divertif.h iso_fs.h ite_gpio.h ixjuser.h \
+	jffs2.h keyctl.h limits.h major.h matroxfb.h meye.h minix_fs.h \
+	mmtimer.h mqueue.h mtio.h ncp_no.h netfilter_arp.h netrom.h \
+	nfs2.h nfs4_mount.h nfs_mount.h openprom_fs.h param.h \
+	pci_ids.h pci_regs.h personality.h pfkeyv2.h pg.h pkt_cls.h \
+	pkt_sched.h posix_types.h ppdev.h prctl.h ps2esdi.h qic117.h \
+	qnxtypes.h quotaio_v1.h quotaio_v2.h radeonfb.h raw.h \
+	resource.h rose.h sctp.h smbno.h snmp.h sockios.h som.h \
+	sound.h stddef.h synclink.h telephony.h termios.h ticable.h \
+	times.h tiocl.h tipc.h toshiba.h ultrasound.h un.h utime.h \
+	utsname.h video_decoder.h video_encoder.h videotext.h vt.h \
+	wavefront.h wireless.h xattr.h x25.h zorro_ids.h
+
+unifdef-y += acct.h adb.h adfs_fs.h agpgart.h apm_bios.h atalk.h \
+	atmarp.h atmdev.h atm.h atm_tcp.h audit.h auto_fs.h binfmts.h \
+	capability.h capi.h cciss_ioctl.h cdrom.h cm4000_cs.h \
+	cn_proc.h coda.h connector.h cramfs_fs.h cuda.h cyclades.h \
+	dccp.h dirent.h divert.h elfcore.h errno.h errqueue.h \
+	ethtool.h eventpoll.h ext2_fs.h ext3_fs.h fb.h fcntl.h \
+	filter.h flat.h fs.h ftape.h gameport.h generic_serial.h \
+	genhd.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h hiddev.h hpet.h \
+	i2c-algo-ite.h i2c.h i2o-dev.h icmpv6.h if_bridge.h if_ec.h \
+	if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
+	if_shaper.h if_tr.h if_tun.h if_vlan.h if_wanpipe.h igmp.h \
+	inet_diag.h in.h inotify.h input.h ipc.h ipmi.h ipv6.h \
+	ipv6_route.h isdn.h isdnif.h isdn_ppp.h isicom.h jbd.h \
+	joystick.h kdev_t.h kd.h kernelcapi.h kernel.h keyboard.h \
+	llc.h loop.h lp.h mempolicy.h mii.h mman.h mroute.h msdos_fs.h \
+	msg.h nbd.h ncp_fs.h ncp.h ncp_mount.h netdevice.h \
+	netfilter_bridge.h netfilter_decnet.h netfilter.h \
+	netfilter_ipv4.h netfilter_ipv6.h netfilter_logging.h net.h \
+	netlink.h nfs3.h nfs4.h nfsacl.h nfs_fs.h nfs.h nfs_idmap.h \
+	n_r3964.h nubus.h nvram.h parport.h patchkey.h pci.h pktcdvd.h \
+	pmu.h poll.h ppp_defs.h ppp-comp.h ptrace.h qnx4_fs.h quota.h \
+	random.h reboot.h reiserfs_fs.h reiserfs_xattr.h romfs_fs.h \
+	route.h rtc.h rtnetlink.h scc.h sched.h sdla.h \
+	selinux_netlink.h sem.h serial_core.h serial.h serio.h shm.h \
+	signal.h smb_fs.h smb.h smb_mount.h socket.h sonet.h sonypi.h \
+	soundcard.h stat.h sysctl.h tcp.h time.h timex.h tty.h types.h \
+	udf_fs_i.h udp.h uinput.h uio.h unistd.h usb_ch9.h \
+	usbdevice_fs.h user.h videodev2.h videodev.h wait.h \
+	wanrouter.h watchdog.h xfrm.h zftape.h
+
+objhdr-y := version.h

diff --git a/include/linux/byteorder/Kbuild b/include/linux/byteorder/Kbuild
new file mode 100644
index 000000000000..84a57d4fb212
--- /dev/null
+++ b/include/linux/byteorder/Kbuild
@@ -0,0 +1,2 @@
+unifdef-y += generic.h swabb.h swab.h
+header-y += big_endian.h little_endian.h pdp_endian.h

diff --git a/include/linux/dvb/Kbuild b/include/linux/dvb/Kbuild
new file mode 100644
index 000000000000..63973af72fd5
--- /dev/null
+++ b/include/linux/dvb/Kbuild
@@ -0,0 +1,2 @@
+header-y += ca.h frontend.h net.h osd.h version.h
+unifdef-y := audio.h dmx.h video.h

diff --git a/include/linux/hdlc/Kbuild b/include/linux/hdlc/Kbuild
new file mode 100644
index 000000000000..1fb26448faa9
--- /dev/null
+++ b/include/linux/hdlc/Kbuild
@@ -0,0 +1 @@
+header-y += ioctl.h

diff --git a/include/linux/isdn/Kbuild b/include/linux/isdn/Kbuild
new file mode 100644
index 000000000000..c1727c893004
--- /dev/null
+++ b/include/linux/isdn/Kbuild
@@ -0,0 +1 @@
+header-y += capicmd.h tpam.h

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
new file mode 100644
index 000000000000..d06311acd448
--- /dev/null
+++ b/include/linux/netfilter/Kbuild
@@ -0,0 +1,11 @@
+header-y := nf_conntrack_sctp.h nf_conntrack_tuple_common.h \
+	nfnetlink_conntrack.h nfnetlink_log.h nfnetlink_queue.h \
+	xt_CLASSIFY.h xt_comment.h xt_connbytes.h xt_connmark.h \
+	xt_CONNMARK.h xt_conntrack.h xt_dccp.h xt_esp.h \
+	xt_helper.h xt_length.h xt_limit.h xt_mac.h xt_mark.h \
+	xt_MARK.h xt_multiport.h xt_NFQUEUE.h xt_pkttype.h \
+	xt_policy.h xt_realm.h xt_sctp.h xt_state.h xt_string.h \
+	xt_tcpmss.h xt_tcpudp.h
+
+unifdef-y := nf_conntrack_common.h nf_conntrack_ftp.h \
+	nf_conntrack_tcp.h nfnetlink.h x_tables.h xt_physdev.h

diff --git a/include/linux/netfilter_arp/Kbuild b/include/linux/netfilter_arp/Kbuild
new file mode 100644
index 000000000000..198ec5e7b17d
--- /dev/null
+++ b/include/linux/netfilter_arp/Kbuild
@@ -0,0 +1,2 @@
+header-y := arpt_mangle.h
+unifdef-y := arp_tables.h

diff --git a/include/linux/netfilter_bridge/Kbuild b/include/linux/netfilter_bridge/Kbuild
new file mode 100644
index 000000000000..5b1aba6abbad
--- /dev/null
+++ b/include/linux/netfilter_bridge/Kbuild
@@ -0,0 +1,4 @@
+header-y += ebt_among.h ebt_arp.h ebt_arpreply.h ebt_ip.h ebt_limit.h \
+	ebt_log.h ebt_mark_m.h ebt_mark_t.h ebt_nat.h ebt_pkttype.h \
+	ebt_redirect.h ebt_stp.h ebt_ulog.h ebt_vlan.h
+unifdef-y := ebtables.h ebt_802_3.h

diff --git a/include/linux/netfilter_ipv4/Kbuild b/include/linux/netfilter_ipv4/Kbuild
new file mode 100644
index 000000000000..04e4d2721689
--- /dev/null
+++ b/include/linux/netfilter_ipv4/Kbuild
@@ -0,0 +1,21 @@
+
+header-y := ip_conntrack_helper.h ip_conntrack_helper_h323_asn1.h \
+	ip_conntrack_helper_h323_types.h ip_conntrack_protocol.h \
+	ip_conntrack_sctp.h ip_conntrack_tcp.h ip_conntrack_tftp.h \
+	ip_nat_pptp.h ipt_addrtype.h ipt_ah.h \
+	ipt_CLASSIFY.h ipt_CLUSTERIP.h ipt_comment.h \
+	ipt_connbytes.h ipt_connmark.h ipt_CONNMARK.h \
+	ipt_conntrack.h ipt_dccp.h ipt_dscp.h ipt_DSCP.h ipt_ecn.h \
+	ipt_ECN.h ipt_esp.h ipt_hashlimit.h ipt_helper.h \
+	ipt_iprange.h ipt_length.h ipt_limit.h ipt_LOG.h ipt_mac.h \
+	ipt_mark.h ipt_MARK.h ipt_multiport.h ipt_NFQUEUE.h \
+	ipt_owner.h ipt_physdev.h ipt_pkttype.h ipt_policy.h \
+	ipt_realm.h ipt_recent.h ipt_REJECT.h ipt_SAME.h \
+	ipt_sctp.h ipt_state.h ipt_string.h ipt_tcpmss.h \
+	ipt_TCPMSS.h ipt_tos.h ipt_TOS.h ipt_ttl.h ipt_TTL.h \
+	ipt_ULOG.h
+
+unifdef-y := ip_conntrack.h ip_conntrack_h323.h ip_conntrack_irc.h \
+	ip_conntrack_pptp.h ip_conntrack_proto_gre.h \
+	ip_conntrack_tuple.h ip_nat.h ip_nat_rule.h ip_queue.h \
+	ip_tables.h

diff --git a/include/linux/netfilter_ipv6/Kbuild b/include/linux/netfilter_ipv6/Kbuild
new file mode 100644
index 000000000000..913ddbf55b4b
--- /dev/null
+++ b/include/linux/netfilter_ipv6/Kbuild
@@ -0,0 +1,6 @@
+header-y += ip6t_HL.h ip6t_LOG.h ip6t_MARK.h ip6t_REJECT.h ip6t_ah.h \
+	ip6t_esp.h ip6t_frag.h ip6t_hl.h ip6t_ipv6header.h \
+	ip6t_length.h ip6t_limit.h ip6t_mac.h ip6t_mark.h \
+	ip6t_multiport.h ip6t_opts.h ip6t_owner.h ip6t_policy.h \
+	ip6t_physdev.h ip6t_rt.h
+unifdef-y := ip6_tables.h

diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild
new file mode 100644
index 000000000000..c8c545665885
--- /dev/null
+++ b/include/linux/nfsd/Kbuild
@@ -0,0 +1,2 @@
+unifdef-y := const.h export.h stats.h syscall.h nfsfh.h debug.h auth.h
+

diff --git a/include/linux/raid/Kbuild b/include/linux/raid/Kbuild
new file mode 100644
index 000000000000..73fa27a8d552
--- /dev/null
+++ b/include/linux/raid/Kbuild
@@ -0,0 +1 @@
+header-y += md_p.h md_u.h

diff --git a/include/linux/sunrpc/Kbuild b/include/linux/sunrpc/Kbuild
new file mode 100644
index 000000000000..0d1d768a27bf
--- /dev/null
+++ b/include/linux/sunrpc/Kbuild
@@ -0,0 +1 @@
+unifdef-y := debug.h

diff --git a/include/linux/tc_act/Kbuild b/include/linux/tc_act/Kbuild
new file mode 100644
index 000000000000..5251a505b2f1
--- /dev/null
+++ b/include/linux/tc_act/Kbuild
@@ -0,0 +1 @@
+header-y += tc_gact.h tc_ipt.h tc_mirred.h tc_pedit.h

diff --git a/include/linux/tc_ematch/Kbuild b/include/linux/tc_ematch/Kbuild
new file mode 100644
index 000000000000..381e93018df6
--- /dev/null
+++ b/include/linux/tc_ematch/Kbuild
@@ -0,0 +1 @@
+headers-y := tc_em_cmp.h tc_em_meta.h tc_em_nbyte.h tc_em_text.h

diff --git a/include/mtd/Kbuild b/include/mtd/Kbuild
new file mode 100644
index 000000000000..e1da2a5b2a57
--- /dev/null
+++ b/include/mtd/Kbuild
@@ -0,0 +1,2 @@
+unifdef-y := mtd-abi.h
+header-y := inftl-user.h jffs2-user.h mtd-user.h nftl-user.h

diff --git a/include/rdma/Kbuild b/include/rdma/Kbuild
new file mode 100644
index 000000000000..eb710ba9b1a0
--- /dev/null
+++ b/include/rdma/Kbuild
@@ -0,0 +1 @@
+header-y := ib_user_mad.h

diff --git a/include/scsi/Kbuild b/include/scsi/Kbuild
new file mode 100644
index 000000000000..14a033d73314
--- /dev/null
+++ b/include/scsi/Kbuild
@@ -0,0 +1,2 @@
+header-y += scsi.h
+unifdef-y := scsi_ioctl.h sg.h

diff --git a/include/sound/Kbuild b/include/sound/Kbuild
new file mode 100644
index 000000000000..3a5a3df61496
--- /dev/null
+++ b/include/sound/Kbuild
@@ -0,0 +1,2 @@
+header-y := asound_fm.h hdsp.h hdspm.h sfnt_info.h sscape_ioctl.h
+unifdef-y := asequencer.h asound.h emu10k1.h sb16_csp.h

diff --git a/include/video/Kbuild b/include/video/Kbuild
new file mode 100644
index 000000000000..76a60737cc15
--- /dev/null
+++ b/include/video/Kbuild
@@ -0,0 +1 @@
+unifdef-y := sisfb.h
-- cgit v1.2.3
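The distinction between the two lists matters: a header-y file is copied to the
install tree essentially verbatim, while a unifdef-y file is first run through
unifdef to strip the kernel-only regions (roughly, unifdef -U__KERNEL__), and
objhdr-y covers generated headers such as version.h. As a rough illustration --
the header name and struct below are invented, not taken from the patch -- a
header listed as unifdef-y loses its __KERNEL__ block on install:

    /* Hypothetical include/linux/example.h as kept in the kernel tree. */
    #ifndef _LINUX_EXAMPLE_H
    #define _LINUX_EXAMPLE_H

    #define EXAMPLE_GET_VERSION 0x4501	/* part of the user ABI, survives install */

    #ifdef __KERNEL__
    /* Kernel-only state; unifdef removes this whole block on export. */
    struct example_private {
    	int refcount;
    };
    #endif /* __KERNEL__ */

    #endif /* _LINUX_EXAMPLE_H */

A header with no __KERNEL__ sections at all belongs in header-y, which is why
the simple ioctl-style headers dominate that list above.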
From 6a87a86d064cee49b3c298c558b990b65d736cf4 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:17:47 +0100
Subject: Add Kbuild file for PowerPC 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-powerpc/Kbuild | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 include/asm-powerpc/Kbuild
(limited to 'include')

diff --git a/include/asm-powerpc/Kbuild b/include/asm-powerpc/Kbuild
new file mode 100644
index 000000000000..ac61d7eb6021
--- /dev/null
+++ b/include/asm-powerpc/Kbuild
@@ -0,0 +1,10 @@
+include include/asm-generic/Kbuild.asm
+
+unifdef-y += a.out.h asm-compat.h bootx.h byteorder.h cputable.h elf.h \
+	nvram.h param.h posix_types.h ptrace.h seccomp.h signal.h \
+	termios.h types.h unistd.h
+
+header-y += auxvec.h ioctls.h mman.h sembuf.h siginfo.h stat.h errno.h \
+	ipcbuf.h msgbuf.h shmbuf.h socket.h termbits.h fcntl.h ipc.h \
+	poll.h shmparam.h sockios.h ucontext.h ioctl.h linkage.h \
+	resource.h sigcontext.h statfs.h
-- cgit v1.2.3

From 54863bb0d8782bbc81fe0275f1d20042fa14e32f Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:19:11 +0100
Subject: Add Kbuild file for x86_64 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-x86_64/Kbuild | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 include/asm-x86_64/Kbuild
(limited to 'include')

diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild
new file mode 100644
index 000000000000..dc4d101e8a16
--- /dev/null
+++ b/include/asm-x86_64/Kbuild
@@ -0,0 +1,11 @@
+include include/asm-generic/Kbuild.asm
+
+ALTARCH := i386
+ARCHDEF := defined __x86_64__
+ALTARCHDEF := defined __i386__
+
+header-y += boot.h bootsetup.h cpufeature.h debugreg.h ldt.h \
+	msr.h prctl.h setup.h sigcontext32.h ucontext.h \
+	vsyscall32.h
+
+unifdef-y += mce.h mtrr.h vsyscall.h
-- cgit v1.2.3

From c509075e711305d591760e4f0e8f57ff178cd501 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:19:46 +0100
Subject: Add Kbuild file for i386 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-i386/Kbuild | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 include/asm-i386/Kbuild
(limited to 'include')

diff --git a/include/asm-i386/Kbuild b/include/asm-i386/Kbuild
new file mode 100644
index 000000000000..c064a8e9170f
--- /dev/null
+++ b/include/asm-i386/Kbuild
@@ -0,0 +1,5 @@
+include include/asm-generic/Kbuild.asm
+
+header-y += boot.h cpufeature.h debugreg.h ldt.h setup.h ucontext.h
+
+unifdef-y += mtrr.h vm86.h
-- cgit v1.2.3

From 1e7c34994d809820dbe31220397b62660d8d1514 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:22:02 +0100
Subject: Add Kbuild file for S390 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-s390/Kbuild | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 include/asm-s390/Kbuild
(limited to 'include')

diff --git a/include/asm-s390/Kbuild b/include/asm-s390/Kbuild
new file mode 100644
index 000000000000..ed8955f49e47
--- /dev/null
+++ b/include/asm-s390/Kbuild
@@ -0,0 +1,4 @@
+include include/asm-generic/Kbuild.asm
+
+unifdef-y += cmb.h debug.h
+header-y += dasd.h qeth.h tape390.h ucontext.h vtoc.h z90crypt.h
-- cgit v1.2.3

From dc901d6d2596d45f2c398e735125e5123b4e2774 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:22:20 +0100
Subject: Add Kbuild file for IA64 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-ia64/Kbuild | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 include/asm-ia64/Kbuild
(limited to 'include')

diff --git a/include/asm-ia64/Kbuild b/include/asm-ia64/Kbuild
new file mode 100644
index 000000000000..85d6f8005eb4
--- /dev/null
+++ b/include/asm-ia64/Kbuild
@@ -0,0 +1,7 @@
+include include/asm-generic/Kbuild.asm
+
+header-y += break.h fpu.h fpswa.h gcc_intrin.h ia64regs.h \
+	intel_intrin.h intrinsics.h perfmon_default_smpl.h \
+	ptrace_offsets.h rse.h setup.h ucontext.h
+
+unifdef-y += perfmon.h
-- cgit v1.2.3

From a29ee9f3bf49d9a19def177145dcb85fc021c096 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:22:49 +0100
Subject: Add Kbuild file for SPARC 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-sparc/Kbuild   | 13 +++++++++++++
 include/asm-sparc64/Kbuild | 16 ++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 include/asm-sparc/Kbuild
 create mode 100644 include/asm-sparc64/Kbuild
(limited to 'include')

diff --git a/include/asm-sparc/Kbuild b/include/asm-sparc/Kbuild
new file mode 100644
index 000000000000..cecdc6145546
--- /dev/null
+++ b/include/asm-sparc/Kbuild
@@ -0,0 +1,13 @@
+include include/asm-generic/Kbuild.asm
+
+header-y += apc.h asi.h asmmacro.h auxio.h bitext.h bpp.h \
+	bsderrno.h btfixup.h clock.h contregs.h cpudata.h cypress.h \
+	ebus.h ecc.h eeprom.h fbio.h floppy.h head.h hw_irq.h \
+	idprom.h io-unit.h iommu.h ipc.h jsflash.h \
+	kdebug.h machines.h mbus.h memreg.h mostek.h mpmbox.h msi.h \
+	mxcc.h obio.h openprom.h openpromio.h oplib.h pbm.h pcic.h \
+	pconf.h perfctr.h pgtsrmmu.h pgtsun4.h pgtsun4c.h psr.h reg.h \
+	ross.h sbi.h sbus.h sfp-machine.h smpprim.h \
+	solerrno.h spinlock.h sun4paddr.h sun4prom.h sunbpp.h svr4.h \
+	swift.h sysen.h timer.h traps.h tsunami.h turbosparc.h \
+	vac-ops.h vaddrs.h vfc_ioctls.h viking.h winmacro.h

diff --git a/include/asm-sparc64/Kbuild b/include/asm-sparc64/Kbuild
new file mode 100644
index 000000000000..5edf7643a209
--- /dev/null
+++ b/include/asm-sparc64/Kbuild
@@ -0,0 +1,16 @@
+include include/asm-generic/Kbuild.asm
+
+ALTARCH := sparc
+ARCHDEF := defined __sparc__ && defined __arch64__
+ALTARCHDEF := defined __sparc__ && !defined __arch64__
+
+header-y += agp.h apb.h asi.h auxio.h bbc.h bpp.h bsderrno.h \
+	chafsr.h chmctrl.h compat.h const.h cpudata.h dcr.h dcu.h \
+	display7seg.h ebus.h envctrl.h estate.h fbio.h fhc.h floppy.h \
+	fpumacro.h head.h hw_irq.h idprom.h iommu.h \
+	ipc.h isa.h kdebug.h lsu.h mostek.h ns87303.h \
+	openprom.h openpromio.h oplib.h parport.h pbm.h pconf.h \
+	perfctr.h pil.h psrcompat.h pstate.h reg.h sbus.h \
+	sfp-machine.h solerrno.h spinlock.h spitfire.h starfire.h \
+	sunbpp.h svr4.h timer.h ttable.h uctx.h upa.h utrap.h vga.h \
+	visasm.h watchdog.h
-- cgit v1.2.3

From 57e580f885a62c21fe75e35e9e445fb6fb82509c Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:45:15 +0100
Subject: Add Kbuild file for Alpha 'make headers_install'

Signed-off-by: David Woodhouse
---
 include/asm-alpha/Kbuild | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 include/asm-alpha/Kbuild
(limited to 'include')

diff --git a/include/asm-alpha/Kbuild b/include/asm-alpha/Kbuild
new file mode 100644
index 000000000000..e57fd57538b8
--- /dev/null
+++ b/include/asm-alpha/Kbuild
@@ -0,0 +1,5 @@
+include include/asm-generic/Kbuild.asm
+
+unifdef-y += console.h fpu.h sysinfo.h
+
+header-y += gentrap.h regdef.h pal.h reg.h
-- cgit v1.2.3
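The ALTARCH/ARCHDEF/ALTARCHDEF variables drive the biarch case: on x86_64 the
install also pulls in the i386 headers, and the install machinery emits a small
dispatching wrapper under asm/ keyed on the compiler-defined macros named by
ARCHDEF and ALTARCHDEF. The exact file the scripts generate is not shown in
these patches; the sketch below (file name and error text invented) only shows
the shape such a wrapper takes:

    /* Hypothetical generated usr/include/asm/foo.h on a biarch x86_64 install. */
    #if defined __x86_64__			/* ARCHDEF branch */
    # include <asm-x86_64/foo.h>
    #elif defined __i386__			/* ALTARCHDEF branch */
    # include <asm-i386/foo.h>
    #else
    # error unsupported architecture for this header
    #endif

The sparc64 Kbuild added below uses the same mechanism, with compound
conditions distinguishing 64-bit and 32-bit SPARC userspace.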
From ef4d04b87d16839500a77aa1279f80be7ec4ef2e Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 18 Jun 2006 12:58:53 +0100
Subject: Add empty Kbuild files for 'make headers_install' in remaining arches.

These include nothing more than the basic set of files listed in
asm-generic/Kbuild.asm. Any extra arch-specific files will need to be
added.

Signed-off-by: David Woodhouse
---
 include/asm-arm/Kbuild       | 1 +
 include/asm-arm26/Kbuild     | 1 +
 include/asm-cris/Kbuild      | 1 +
 include/asm-frv/Kbuild       | 1 +
 include/asm-h8300/Kbuild     | 1 +
 include/asm-m32r/Kbuild      | 1 +
 include/asm-m68k/Kbuild      | 1 +
 include/asm-m68knommu/Kbuild | 1 +
 include/asm-mips/Kbuild      | 1 +
 include/asm-parisc/Kbuild    | 1 +
 include/asm-sh/Kbuild        | 1 +
 include/asm-sh64/Kbuild      | 1 +
 include/asm-um/Kbuild        | 1 +
 include/asm-v850/Kbuild      | 1 +
 include/asm-xtensa/Kbuild    | 1 +
 15 files changed, 15 insertions(+)
 create mode 100644 include/asm-arm/Kbuild
 create mode 100644 include/asm-arm26/Kbuild
 create mode 100644 include/asm-cris/Kbuild
 create mode 100644 include/asm-frv/Kbuild
 create mode 100644 include/asm-h8300/Kbuild
 create mode 100644 include/asm-m32r/Kbuild
 create mode 100644 include/asm-m68k/Kbuild
 create mode 100644 include/asm-m68knommu/Kbuild
 create mode 100644 include/asm-mips/Kbuild
 create mode 100644 include/asm-parisc/Kbuild
 create mode 100644 include/asm-sh/Kbuild
 create mode 100644 include/asm-sh64/Kbuild
 create mode 100644 include/asm-um/Kbuild
 create mode 100644 include/asm-v850/Kbuild
 create mode 100644 include/asm-xtensa/Kbuild
(limited to 'include')

diff --git a/include/asm-arm/Kbuild b/include/asm-arm/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-arm/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-arm26/Kbuild b/include/asm-arm26/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-arm26/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-cris/Kbuild b/include/asm-cris/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-cris/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-frv/Kbuild b/include/asm-frv/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-frv/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-h8300/Kbuild b/include/asm-h8300/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-h8300/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-m32r/Kbuild b/include/asm-m32r/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-m32r/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-m68k/Kbuild b/include/asm-m68k/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-m68k/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-m68knommu/Kbuild b/include/asm-m68knommu/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-m68knommu/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-mips/Kbuild b/include/asm-mips/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-mips/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-parisc/Kbuild b/include/asm-parisc/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-parisc/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-sh/Kbuild b/include/asm-sh/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-sh/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-sh64/Kbuild b/include/asm-sh64/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-sh64/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-um/Kbuild b/include/asm-um/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-um/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-v850/Kbuild b/include/asm-v850/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-v850/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm

diff --git a/include/asm-xtensa/Kbuild b/include/asm-xtensa/Kbuild
new file mode 100644
index 000000000000..c68e1680da01
--- /dev/null
+++ b/include/asm-xtensa/Kbuild
@@ -0,0 +1 @@
+include include/asm-generic/Kbuild.asm
-- cgit v1.2.3
From 4d1a099828fb3ed52ff666fceb6a7d562d9048b8 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 20 Jun 2006 08:34:40 +0100
Subject: Restrict headers exported to userspace for SPARC and SPARC64

Signed-off-by: David S. Miller
Signed-off-by: David Woodhouse
---
 include/asm-sparc/Kbuild   | 15 ++++-----------
 include/asm-sparc64/Kbuild | 16 +++++-----------
 2 files changed, 9 insertions(+), 22 deletions(-)
(limited to 'include')

diff --git a/include/asm-sparc/Kbuild b/include/asm-sparc/Kbuild
index cecdc6145546..e2a57fd7abfa 100644
--- a/include/asm-sparc/Kbuild
+++ b/include/asm-sparc/Kbuild
@@ -1,13 +1,6 @@
 include include/asm-generic/Kbuild.asm
 
-header-y += apc.h asi.h asmmacro.h auxio.h bitext.h bpp.h \
-	bsderrno.h btfixup.h clock.h contregs.h cpudata.h cypress.h \
-	ebus.h ecc.h eeprom.h fbio.h floppy.h head.h hw_irq.h \
-	idprom.h io-unit.h iommu.h ipc.h jsflash.h \
-	kdebug.h machines.h mbus.h memreg.h mostek.h mpmbox.h msi.h \
-	mxcc.h obio.h openprom.h openpromio.h oplib.h pbm.h pcic.h \
-	pconf.h perfctr.h pgtsrmmu.h pgtsun4.h pgtsun4c.h psr.h reg.h \
-	ross.h sbi.h sbus.h sfp-machine.h smpprim.h \
-	solerrno.h spinlock.h sun4paddr.h sun4prom.h sunbpp.h svr4.h \
-	swift.h sysen.h timer.h traps.h tsunami.h turbosparc.h \
-	vac-ops.h vaddrs.h vfc_ioctls.h viking.h winmacro.h
+unifdef-y += fbio.h perfctr.h psr.h
+header-y += apc.h asi.h auxio.h bpp.h head.h ipc.h jsflash.h \
+	openpromio.h pbm.h pconf.h pgtsun4.h reg.h traps.h \
+	turbosparc.h vfc_ioctls.h winmacro.h

diff --git a/include/asm-sparc64/Kbuild b/include/asm-sparc64/Kbuild
index 5edf7643a209..c78d44bb195f 100644
--- a/include/asm-sparc64/Kbuild
+++ b/include/asm-sparc64/Kbuild
@@ -1,16 +1,10 @@
 include include/asm-generic/Kbuild.asm
 
 ALTARCH := sparc
-ARCHDEF := defined __sparc__ && defined __arch64__
+ARCHDEF := defined __sparc__ && defined __arch64__
 ALTARCHDEF := defined __sparc__ && !defined __arch64__
 
-header-y += agp.h apb.h asi.h auxio.h bbc.h bpp.h bsderrno.h \
-	chafsr.h chmctrl.h compat.h const.h cpudata.h dcr.h dcu.h \
-	display7seg.h ebus.h envctrl.h estate.h fbio.h fhc.h floppy.h \
-	fpumacro.h head.h hw_irq.h idprom.h iommu.h \
-	ipc.h isa.h kdebug.h lsu.h mostek.h ns87303.h \
-	openprom.h openpromio.h oplib.h parport.h pbm.h pconf.h \
-	perfctr.h pil.h psrcompat.h pstate.h reg.h sbus.h \
-	sfp-machine.h solerrno.h spinlock.h spitfire.h starfire.h \
-	sunbpp.h svr4.h timer.h ttable.h uctx.h upa.h utrap.h vga.h \
-	visasm.h watchdog.h
+unifdef-y := fbio.h perfctr.h
+header-y += apb.h asi.h bbc.h bpp.h display7seg.h envctrl.h floppy.h \
+	ipc.h kdebug.h mostek.h openprom.h openpromio.h parport.h \
+	pconf.h psrcompat.h pstate.h reg.h uctx.h utrap.h watchdog.h
-- cgit v1.2.3

From 1bca9f2e5bd7f92fe4b34753e57bf4f47d3a2f01 Mon Sep 17 00:00:00 2001
From: Jean Delvare
Date: Thu, 22 Jun 2006 14:24:31 +0100
Subject: Remove <linux/i2c-id.h> and <linux/i2c-algo-ite.h> from userspace export

Do not export the following i2c header files to user space:

* i2c-id.h: These IDs are internal to the kernel and user space does
  not need them.

* i2c-algo-ite.h: The driver is broken and planned for removal, so
  let's not export it for now. The extra ioctl defined here can be
  standardized and moved to i2c-core/i2c-dev if really needed.

Signed-off-by: Jean Delvare
Signed-off-by: David Woodhouse
---
 include/linux/Kbuild | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'include')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 5c95aa4dc0b4..f7252be57702 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -12,7 +12,7 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
 	dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \
 	fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \
 	fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \
-	hpfs_fs.h hysdn_if.h i2c-dev.h i2c-id.h i8k.h icmp.h \
+	hpfs_fs.h hysdn_if.h i2c-dev.h i8k.h icmp.h \
 	if_arcnet.h if_arp.h if_bonding.h if_cablemodem.h if_fc.h \
 	if_fddi.h if.h if_hippi.h if_infiniband.h if_packet.h \
 	if_plip.h if_ppp.h if_slip.h if_strip.h if_tunnel.h in6.h \
@@ -38,7 +38,7 @@ unifdef-y += acct.h adb.h adfs_fs.h agpgart.h apm_bios.h atalk.h \
 	ethtool.h eventpoll.h ext2_fs.h ext3_fs.h fb.h fcntl.h \
 	filter.h flat.h fs.h ftape.h gameport.h generic_serial.h \
 	genhd.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h hiddev.h hpet.h \
-	i2c-algo-ite.h i2c.h i2o-dev.h icmpv6.h if_bridge.h if_ec.h \
+	i2c.h i2o-dev.h icmpv6.h if_bridge.h if_ec.h \
 	if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
 	if_shaper.h if_tr.h if_tun.h if_vlan.h if_wanpipe.h igmp.h \
 	inet_diag.h in.h inotify.h input.h ipc.h ipmi.h ipv6.h \
-- cgit v1.2.3
From 03aba2f79594ca94d159c8bab454de9bcc385b76 Mon Sep 17 00:00:00 2001
From: Luben Tuikov
Date: Fri, 23 Jun 2006 09:39:09 -0700
Subject: [SCSI] sd/scsi_lib simplify sd_rw_intr and scsi_io_completion

This patch simplifies "good_bytes" computation in sd_rw_intr().

sd: "good_bytes" computation is always done in terms of the resolution
of the device's medium, since after that it is the number of good bytes
we pass around and other layers/contexts (as opposed to sd) can
translate that to their own resolution (block layer: 512).

It also makes scsi_io_completion() processing more straightforward,
eliminating the 3rd argument to the function.

It also fixes a couple of bugs like not checking the return value, and
using "break" instead of "return;", etc.

I've been running with this patch for some time now on a test
(do-it-all) system.

Signed-off-by: Luben Tuikov
Signed-off-by: James Bottomley
---
 drivers/scsi/scsi_lib.c  | 108 ++++++++++++++--------------------
 drivers/scsi/sd.c        | 143 ++++++++++++++++++++---------------------------
 drivers/scsi/sr.c        |   2 +-
 include/scsi/scsi_cmnd.h |   2 +-
 4 files changed, 106 insertions(+), 149 deletions(-)
(limited to 'include')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 3d04a9f386ac..4c4add53d69b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -855,8 +855,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
  * b) We can just use scsi_requeue_command() here.  This would
  *    be used if we just wanted to retry, for example.
  */
-void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
-			unsigned int block_bytes)
+void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 {
 	int result = cmd->result;
 	int this_count = cmd->bufflen;
@@ -921,87 +920,72 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
 	 * Next deal with any sectors which we were able to correctly
 	 * handle.
 	 */
-	if (good_bytes >= 0) {
-		SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, %d bytes done.\n",
+	if (good_bytes > 0) {
+		SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
+					      "%d bytes done.\n",
 					      req->nr_sectors, good_bytes));
 		SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
 
 		if (clear_errors)
 			req->errors = 0;
-		/*
-		 * If multiple sectors are requested in one buffer, then
-		 * they will have been finished off by the first command.
-		 * If not, then we have a multi-buffer command.
-		 *
-		 * If block_bytes != 0, it means we had a medium error
-		 * of some sort, and that we want to mark some number of
-		 * sectors as not uptodate.  Thus we want to inhibit
-		 * requeueing right here - we will requeue down below
-		 * when we handle the bad sectors.
-		 */
-		/*
-		 * If the command completed without error, then either
-		 * finish off the rest of the command, or start a new one.
+		/* A number of bytes were successfully read.  If there
+		 * is leftovers and there is some kind of error
+		 * (result != 0), retry the rest.
 		 */
-		if (scsi_end_request(cmd, 1, good_bytes, result == 0) == NULL)
+		if (scsi_end_request(cmd, 1, good_bytes, !!result) == NULL)
 			return;
 	}
-	/*
-	 * Now, if we were good little boys and girls, Santa left us a request
-	 * sense buffer.  We can extract information from this, so we
-	 * can choose a block to remap, etc.
+
+	/* good_bytes = 0, or (inclusive) there were leftovers and
+	 * result = 0, so scsi_end_request couldn't retry.
 	 */
 	if (sense_valid && !sense_deferred) {
 		switch (sshdr.sense_key) {
 		case UNIT_ATTENTION:
 			if (cmd->device->removable) {
-				/* detected disc change.  set a bit
+				/* Detected disc change.  Set a bit
 				 * and quietly refuse further access.
 				 */
 				cmd->device->changed = 1;
-				scsi_end_request(cmd, 0,
-						this_count, 1);
+				scsi_end_request(cmd, 0, this_count, 1);
 				return;
 			} else {
-				/*
-				 * Must have been a power glitch, or a
-				 * bus reset.  Could not have been a
-				 * media change, so we just retry the
-				 * request and see what happens.
-				 */
+				/* Must have been a power glitch, or a
+				 * bus reset.  Could not have been a
+				 * media change, so we just retry the
+				 * request and see what happens.
+				 */
 				scsi_requeue_command(q, cmd);
 				return;
 			}
 			break;
 		case ILLEGAL_REQUEST:
-			/*
-			 * If we had an ILLEGAL REQUEST returned, then we may
-			 * have performed an unsupported command.  The only
-			 * thing this should be would be a ten byte read where
-			 * only a six byte read was supported.  Also, on a
-			 * system where READ CAPACITY failed, we may have read
-			 * past the end of the disk.
-			 */
+			/* If we had an ILLEGAL REQUEST returned, then
+			 * we may have performed an unsupported
+			 * command.  The only thing this should be
+			 * would be a ten byte read where only a six
+			 * byte read was supported.  Also, on a system
+			 * where READ CAPACITY failed, we may have
+			 * read past the end of the disk.
+			 */
 			if ((cmd->device->use_10_for_rw &&
 			    sshdr.asc == 0x20 && sshdr.ascq == 0x00) &&
 			    (cmd->cmnd[0] == READ_10 ||
 			     cmd->cmnd[0] == WRITE_10)) {
 				cmd->device->use_10_for_rw = 0;
-				/*
-				 * This will cause a retry with a 6-byte
-				 * command.
+				/* This will cause a retry with a
+				 * 6-byte command.
 				 */
 				scsi_requeue_command(q, cmd);
-				result = 0;
+				return;
 			} else {
 				scsi_end_request(cmd, 0, this_count, 1);
 				return;
 			}
 			break;
 		case NOT_READY:
-			/*
-			 * If the device is in the process of becoming
+			/* If the device is in the process of becoming
 			 * ready, or has a temporary blockage, retry.
 			 */
 			if (sshdr.asc == 0x04) {
@@ -1021,7 +1005,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
 			}
 			if (!(req->flags & REQ_QUIET)) {
 				scmd_printk(KERN_INFO, cmd,
-					   "Device not ready: ");
+					    "Device not ready: ");
 				scsi_print_sense_hdr("", &sshdr);
 			}
 			scsi_end_request(cmd, 0, this_count, 1);
@@ -1029,21 +1013,21 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
 		case VOLUME_OVERFLOW:
 			if (!(req->flags & REQ_QUIET)) {
 				scmd_printk(KERN_INFO, cmd,
-					   "Volume overflow, CDB: ");
+					    "Volume overflow, CDB: ");
 				__scsi_print_command(cmd->data_cmnd);
 				scsi_print_sense("", cmd);
 			}
-			scsi_end_request(cmd, 0, block_bytes, 1);
+			/* See SSC3rXX or current. */
+			scsi_end_request(cmd, 0, this_count, 1);
 			return;
 		default:
 			break;
 		}
-	} /* driver byte != 0 */
+	}
 	if (host_byte(result) == DID_RESET) {
-		/*
-		 * Third party bus reset or reset for error
-		 * recovery reasons.  Just retry the request
-		 * and see what happens.
+		/* Third party bus reset or reset for error recovery
+		 * reasons.  Just retry the request and see what
+		 * happens.
 		 */
 		scsi_requeue_command(q, cmd);
 		return;
@@ -1051,21 +1035,13 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
 	if (result) {
 		if (!(req->flags & REQ_QUIET)) {
 			scmd_printk(KERN_INFO, cmd,
-				    "SCSI error: return code = 0x%x\n", result);
-
+				    "SCSI error: return code = 0x%08x\n",
+				    result);
 			if (driver_byte(result) & DRIVER_SENSE)
 				scsi_print_sense("", cmd);
 		}
-	/*
-	 * Mark a single buffer as not uptodate.  Queue the remainder.
-	 * We sometimes get this cruft in the event that a medium error
-	 * isn't properly reported.
-	 */
-	block_bytes = req->hard_cur_sectors << 9;
-	if (!block_bytes)
-		block_bytes = req->data_len;
-	scsi_end_request(cmd, 0, block_bytes, 1);
 	}
+	scsi_end_request(cmd, 0, this_count, !result);
 }
 EXPORT_SYMBOL(scsi_io_completion);
@@ -1169,7 +1145,7 @@ static void scsi_blk_pc_done(struct scsi_cmnd *cmd)
 	 * successfully. Since this is a REQ_BLOCK_PC command the
 	 * caller should check the request's errors value
 	 */
-	scsi_io_completion(cmd, cmd->bufflen, 0);
+	scsi_io_completion(cmd, cmd->bufflen);
 }
 
 static void scsi_setup_blk_pc_cmnd(struct scsi_cmnd *cmd)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 354199011246..f899ff0cf005 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -891,11 +891,10 @@ static struct block_device_operations sd_fops = {
 static void sd_rw_intr(struct scsi_cmnd * SCpnt)
 {
 	int result = SCpnt->result;
-	int this_count = SCpnt->request_bufflen;
-	int good_bytes = (result == 0 ? this_count : 0);
-	sector_t block_sectors = 1;
-	u64 first_err_block;
-	sector_t error_sector;
+	unsigned int xfer_size = SCpnt->request_bufflen;
+	unsigned int good_bytes = result ? 0 : xfer_size;
+	u64 start_lba = SCpnt->request->sector;
+	u64 bad_lba;
 	struct scsi_sense_hdr sshdr;
 	int sense_valid = 0;
 	int sense_deferred = 0;
@@ -906,7 +905,6 @@ static void sd_rw_intr(struct scsi_cmnd * SCpnt)
 		if (sense_valid)
 			sense_deferred = scsi_sense_is_deferred(&sshdr);
 	}
-
 #ifdef CONFIG_SCSI_LOGGING
 	SCSI_LOG_HLCOMPLETE(1, printk("sd_rw_intr: %s: res=0x%x\n",
 		SCpnt->request->rq_disk->disk_name, result));
@@ -916,89 +914,72 @@ static void sd_rw_intr(struct scsi_cmnd * SCpnt)
 			sshdr.sense_key, sshdr.asc, sshdr.ascq));
 	}
 #endif
-	/*
-	   Handle MEDIUM ERRORs that indicate partial success.  Since this is a
-	   relatively rare error condition, no care is taken to avoid
-	   unnecessary additional work such as memcpy's that could be avoided.
-	 */
-	if (driver_byte(result) != 0 &&
-	    sense_valid && !sense_deferred) {
-		switch (sshdr.sense_key) {
-		case MEDIUM_ERROR:
-			if (!blk_fs_request(SCpnt->request))
-				break;
-			info_valid = scsi_get_sense_info_fld(
-				SCpnt->sense_buffer, SCSI_SENSE_BUFFERSIZE,
-				&first_err_block);
-			/*
-			 * May want to warn and skip if following cast results
-			 * in actual truncation (if sector_t < 64 bits)
-			 */
-			error_sector = (sector_t)first_err_block;
-			if (SCpnt->request->bio != NULL)
-				block_sectors = bio_sectors(SCpnt->request->bio);
-			switch (SCpnt->device->sector_size) {
-			case 1024:
-				error_sector <<= 1;
-				if (block_sectors < 2)
-					block_sectors = 2;
-				break;
-			case 2048:
-				error_sector <<= 2;
-				if (block_sectors < 4)
-					block_sectors = 4;
-				break;
-			case 4096:
-				error_sector <<=3;
-				if (block_sectors < 8)
-					block_sectors = 8;
-				break;
-			case 256:
-				error_sector >>= 1;
-				break;
-			default:
-				break;
-			}
+	if (driver_byte(result) != DRIVER_SENSE &&
+	    (!sense_valid || sense_deferred))
+		goto out;
 
-			error_sector &= ~(block_sectors - 1);
-			good_bytes = (error_sector - SCpnt->request->sector) << 9;
-			if (good_bytes < 0 || good_bytes >= this_count)
-				good_bytes = 0;
+	switch (sshdr.sense_key) {
+	case HARDWARE_ERROR:
+	case MEDIUM_ERROR:
+		if (!blk_fs_request(SCpnt->request))
+			goto out;
+		info_valid = scsi_get_sense_info_fld(SCpnt->sense_buffer,
+						     SCSI_SENSE_BUFFERSIZE,
+						     &bad_lba);
+		if (!info_valid)
+			goto out;
+		if (xfer_size <= SCpnt->device->sector_size)
+			goto out;
+		switch (SCpnt->device->sector_size) {
+		case 256:
+			start_lba <<= 1;
 			break;
-
-		case RECOVERED_ERROR: /* an error occurred, but it recovered */
-		case NO_SENSE: /* LLDD got sense data */
-			/*
-			 * Inform the user, but make sure that it's not treated
-			 * as a hard error.
-			 */
-			scsi_print_sense("sd", SCpnt);
-			SCpnt->result = 0;
-			memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
-			good_bytes = this_count;
+		case 512:
 			break;
-
-		case ILLEGAL_REQUEST:
-			if (SCpnt->device->use_10_for_rw &&
-			    (SCpnt->cmnd[0] == READ_10 ||
-			     SCpnt->cmnd[0] == WRITE_10))
-				SCpnt->device->use_10_for_rw = 0;
-			if (SCpnt->device->use_10_for_ms &&
-			    (SCpnt->cmnd[0] == MODE_SENSE_10 ||
-			     SCpnt->cmnd[0] == MODE_SELECT_10))
-				SCpnt->device->use_10_for_ms = 0;
+		case 1024:
+			start_lba >>= 1;
+			break;
+		case 2048:
+			start_lba >>= 2;
+			break;
+		case 4096:
+			start_lba >>= 3;
 			break;
 		default:
+			/* Print something here with limiting frequency. */
+			goto out;
 			break;
 		}
+		/* This computation should always be done in terms of
+		 * the resolution of the device's medium.
+		 */
+		good_bytes = (bad_lba - start_lba)*SCpnt->device->sector_size;
+		break;
+	case RECOVERED_ERROR:
+	case NO_SENSE:
+		/* Inform the user, but make sure that it's not treated
+		 * as a hard error.
+		 */
+		scsi_print_sense("sd", SCpnt);
+		SCpnt->result = 0;
+		memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
+		good_bytes = xfer_size;
+		break;
+	case ILLEGAL_REQUEST:
+		if (SCpnt->device->use_10_for_rw &&
+		    (SCpnt->cmnd[0] == READ_10 ||
+		     SCpnt->cmnd[0] == WRITE_10))
+			SCpnt->device->use_10_for_rw = 0;
+		if (SCpnt->device->use_10_for_ms &&
+		    (SCpnt->cmnd[0] == MODE_SENSE_10 ||
+		     SCpnt->cmnd[0] == MODE_SELECT_10))
+			SCpnt->device->use_10_for_ms = 0;
+		break;
+	default:
+		break;
 	}
-	/*
-	 * This calls the generic completion function, now that we know
-	 * how many actual sectors finished, and how many sectors we need
-	 * to say have failed.
-	 */
-	scsi_io_completion(SCpnt, good_bytes, block_sectors << 9);
+ out:
+	scsi_io_completion(SCpnt, good_bytes);
 }
 
 static int media_not_present(struct scsi_disk *sdkp,

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index ebf6579ed698..fd94408577e5 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -292,7 +292,7 @@ static void rw_intr(struct scsi_cmnd * SCpnt)
 	 * how many actual sectors finished, and how many sectors we need
 	 * to say have failed.
 	 */
-	scsi_io_completion(SCpnt, good_bytes, block_sectors << 9);
+	scsi_io_completion(SCpnt, good_bytes);
 }
 
 static int sr_init_command(struct scsi_cmnd * SCpnt)

diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index e46cd404bd7d..371f70d9aa92 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -143,7 +143,7 @@ struct scsi_cmnd {
 
 extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, gfp_t);
 extern void scsi_put_command(struct scsi_cmnd *);
-extern void scsi_io_completion(struct scsi_cmnd *, unsigned int, unsigned int);
+extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
 extern void scsi_finish_command(struct scsi_cmnd *cmd);
 extern void scsi_req_abort_cmd(struct scsi_cmnd *cmd);
-- cgit v1.2.3
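The good_bytes convention is easiest to see with numbers. The fragment below is
not from the patch; it is a worked example (the values are invented) of the
computation sd_rw_intr now performs for a partially successful transfer:

    /* A READ of 8 sectors starting at LBA 100 on a 512-byte-sector disk
     * reports a medium error at LBA 104 (bad_lba from the sense data). */
    u64 start_lba = 100, bad_lba = 104;
    unsigned int sector_size = 512;
    unsigned int good_bytes = (bad_lba - start_lba) * sector_size;	/* 2048 */

    /* scsi_io_completion(SCpnt, 2048) then completes the first four
     * sectors upward and error-handles the remainder; the block layer
     * converts the byte count back into its own 512-byte resolution. */

Because the byte count is computed at the medium's resolution, the same
arithmetic holds unchanged for 1024-, 2048- and 4096-byte sectors once start_lba
has been rescaled by the shifts shown in the switch above.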
From d7a1bb0a04ca835bffc0a91e64ab827dfba7d8f5 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 8 Mar 2006 14:50:12 -0500
Subject: [SCSI] Block I/O while SG reset operation in progress - the midlayer patch

The scsi midlayer portion of the patch

Signed-off-by: James Smart
Signed-off-by: James Bottomley
---
 drivers/scsi/scsi_error.c | 22 ++++++++++++++++++++++
 include/scsi/scsi_host.h  |  6 +++++-
 2 files changed, 27 insertions(+), 1 deletion(-)
(limited to 'include')

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 6a7a60fc0a4e..6683d596234a 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1672,7 +1672,9 @@ int
 scsi_reset_provider(struct scsi_device *dev, int flag)
 {
 	struct scsi_cmnd *scmd = scsi_get_command(dev, GFP_KERNEL);
+	struct Scsi_Host *shost = dev->host;
 	struct request req;
+	unsigned long flags;
 	int rtn;
 
 	scmd->request = &req;
@@ -1699,6 +1701,10 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
 	 */
 	scmd->pid = 0;
 
+	spin_lock_irqsave(shost->host_lock, flags);
+	shost->tmf_in_progress = 1;
+	spin_unlock_irqrestore(shost->host_lock, flags);
+
 	switch (flag) {
 	case SCSI_TRY_RESET_DEVICE:
 		rtn = scsi_try_bus_device_reset(scmd);
@@ -1717,6 +1723,22 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
 		rtn = FAILED;
 	}
 
+	spin_lock_irqsave(shost->host_lock, flags);
+	shost->tmf_in_progress = 0;
+	spin_unlock_irqrestore(shost->host_lock, flags);
+
+	/*
+	 * be sure to wake up anyone who was sleeping or had their queue
+	 * suspended while we performed the TMF.
+	 */
+	SCSI_LOG_ERROR_RECOVERY(3,
+		printk("%s: waking up host to restart after TMF\n",
+		__FUNCTION__));
+
+	wake_up(&shost->host_wait);
+
+	scsi_run_host_queues(shost);
+
 	scsi_next_command(scmd);
 	return rtn;
 }

diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index a42efd6e4be8..b3dd90f3e858 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -542,6 +542,9 @@ struct Scsi_Host {
 	 */
 	unsigned ordered_tag:1;
 
+	/* task mgmt function in progress */
+	unsigned tmf_in_progress:1;
+
 	/*
 	 * Optional work queue to be utilized by the transport
 	 */
@@ -619,7 +622,8 @@ static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
 {
 	return shost->shost_state == SHOST_RECOVERY ||
 		shost->shost_state == SHOST_CANCEL_RECOVERY ||
-		shost->shost_state == SHOST_DEL_RECOVERY;
+		shost->shost_state == SHOST_DEL_RECOVERY ||
+		shost->tmf_in_progress;
}
 
 extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
-- cgit v1.2.3
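Because scsi_host_in_recovery() now also reports tmf_in_progress, any path that
already consults it quietly blocks new I/O for the duration of the task
management function, with no new locking on the submission side. A minimal
sketch of such a caller follows; it is not taken from this patch (the function
around the check is invented), though scsi_host_in_recovery() and
SCSI_MLQUEUE_HOST_BUSY are real midlayer names:

    /* Hypothetical submission-path gate: defer the command while error
     * recovery or a TMF holds the host. */
    static int example_may_queue(struct Scsi_Host *shost)
    {
    	if (scsi_host_in_recovery(shost))
    		return SCSI_MLQUEUE_HOST_BUSY;	/* requeued and retried later */
    	return 0;
    }

The wake_up()/scsi_run_host_queues() pair at the end of scsi_reset_provider()
is the matching release: once tmf_in_progress drops back to 0, anything that
was deferred by this check gets kicked back into motion.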
From 65c92b09acf0218b64f1c7ba4fdabeb8b732c876 Mon Sep 17 00:00:00 2001
From: James Bottomley
Date: Wed, 28 Jun 2006 12:22:50 -0400
Subject: [SCSI] scsi_transport_sas: introduce a sas_port entity

this patch introduces a port object, separates out ports and phys, with
ports becoming the primary objects of the tree.

Signed-off-by: James Bottomley
---
 drivers/scsi/scsi_sas_internal.h  |  10 +-
 drivers/scsi/scsi_transport_sas.c | 371 ++++++++++++++++++++++++++++++++++----
 include/scsi/scsi_transport_sas.h |  37 +++-
 3 files changed, 372 insertions(+), 46 deletions(-)
(limited to 'include')

diff --git a/drivers/scsi/scsi_sas_internal.h b/drivers/scsi/scsi_sas_internal.h
index d76e6e3d8ca5..e1edab45a37b 100644
--- a/drivers/scsi/scsi_sas_internal.h
+++ b/drivers/scsi/scsi_sas_internal.h
@@ -2,7 +2,8 @@
 #define _SCSI_SAS_INTERNAL_H
 
 #define SAS_HOST_ATTRS		0
-#define SAS_PORT_ATTRS		17
+#define SAS_PHY_ATTRS		17
+#define SAS_PORT_ATTRS		1
 #define SAS_RPORT_ATTRS		7
 #define SAS_END_DEV_ATTRS	3
 #define SAS_EXPANDER_ATTRS	7
@@ -13,12 +14,14 @@ struct sas_internal {
 	struct sas_domain_function_template *dft;
 
 	struct class_device_attribute private_host_attrs[SAS_HOST_ATTRS];
-	struct class_device_attribute private_phy_attrs[SAS_PORT_ATTRS];
+	struct class_device_attribute private_phy_attrs[SAS_PHY_ATTRS];
+	struct class_device_attribute private_port_attrs[SAS_PORT_ATTRS];
	struct class_device_attribute private_rphy_attrs[SAS_RPORT_ATTRS];
 	struct class_device_attribute private_end_dev_attrs[SAS_END_DEV_ATTRS];
 	struct class_device_attribute private_expander_attrs[SAS_EXPANDER_ATTRS];
 
 	struct transport_container phy_attr_cont;
+	struct transport_container port_attr_cont;
 	struct transport_container rphy_attr_cont;
 	struct transport_container end_dev_attr_cont;
 	struct transport_container expander_attr_cont;
@@ -28,7 +31,8 @@ struct sas_internal {
 	 * needed by scsi_sysfs.c
 	 */
 	struct class_device_attribute *host_attrs[SAS_HOST_ATTRS + 1];
-	struct class_device_attribute *phy_attrs[SAS_PORT_ATTRS + 1];
+	struct class_device_attribute *phy_attrs[SAS_PHY_ATTRS + 1];
+	struct class_device_attribute *port_attrs[SAS_PORT_ATTRS + 1];
 	struct class_device_attribute *rphy_attrs[SAS_RPORT_ATTRS + 1];
 	struct class_device_attribute *end_dev_attrs[SAS_END_DEV_ATTRS + 1];
 	struct class_device_attribute *expander_attrs[SAS_EXPANDER_ATTRS + 1];

diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 1fe6b2d01853..dd075627e605 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -174,11 +174,28 @@ static int sas_host_match(struct attribute_container *cont,
 
 static int do_sas_phy_delete(struct device *dev, void *data)
 {
-	if (scsi_is_sas_phy(dev))
+	int pass = (int)(unsigned long)data;
+
+	if (pass == 0 && scsi_is_sas_port(dev))
+		sas_port_delete(dev_to_sas_port(dev));
+	else if (pass == 1 && scsi_is_sas_phy(dev))
 		sas_phy_delete(dev_to_phy(dev));
 	return 0;
 }
 
+/**
+ * sas_remove_children -- tear down a device's SAS data structures
+ * @dev: device belonging to the sas object
+ *
+ * Removes all SAS PHYs and remote PHYs for a given object
+ */
+void sas_remove_children(struct device *dev)
+{
+	device_for_each_child(dev, (void *)0, do_sas_phy_delete);
+	device_for_each_child(dev, (void *)1, do_sas_phy_delete);
+}
+EXPORT_SYMBOL(sas_remove_children);
+
 /**
  * sas_remove_host -- tear down a Scsi_Host's SAS data structures
  * @shost: Scsi Host that is torn down
@@ -188,13 +205,13 @@ static int do_sas_phy_delete(struct device *dev, void *data)
  */
 void sas_remove_host(struct Scsi_Host *shost)
 {
-	device_for_each_child(&shost->shost_gendev, NULL, do_sas_phy_delete);
+	sas_remove_children(&shost->shost_gendev);
 }
 EXPORT_SYMBOL(sas_remove_host);
 
 
 /*
- * SAS Port attributes
+ * SAS Phy attributes
 */
 
 #define sas_phy_show_simple(field, name, format_string, cast)		\
@@ -310,7 +327,7 @@ sas_phy_protocol_attr(identify.target_port_protocols,
 sas_phy_simple_attr(identify.sas_address, sas_address, "0x%016llx\n",
		unsigned long long);
 sas_phy_simple_attr(identify.phy_identifier, phy_identifier, "%d\n", u8);
-sas_phy_simple_attr(port_identifier, port_identifier, "%d\n", u8);
+//sas_phy_simple_attr(port_identifier, port_identifier, "%d\n", u8);
 sas_phy_linkspeed_attr(negotiated_linkrate);
 sas_phy_linkspeed_attr(minimum_linkrate_hw);
 sas_phy_linkspeed_attr(minimum_linkrate);
@@ -378,9 +395,10 @@ struct sas_phy *sas_phy_alloc(struct device *parent, int number)
 	device_initialize(&phy->dev);
 	phy->dev.parent = get_device(parent);
 	phy->dev.release = sas_phy_release;
+	INIT_LIST_HEAD(&phy->port_siblings);
 	if (scsi_is_sas_expander_device(parent)) {
 		struct sas_rphy *rphy = dev_to_rphy(parent);
-		sprintf(phy->dev.bus_id, "phy-%d-%d:%d", shost->host_no,
+		sprintf(phy->dev.bus_id, "phy-%d:%d:%d", shost->host_no,
 			rphy->scsi_target_id, number);
 	} else
 		sprintf(phy->dev.bus_id, "phy-%d:%d", shost->host_no, number);
@@ -440,8 +458,8 @@ sas_phy_delete(struct sas_phy *phy)
 {
 	struct device *dev = &phy->dev;
 
-	if (phy->rphy)
-		sas_rphy_delete(phy->rphy);
+	/* this happens if the phy is still part of a port when deleted */
+	BUG_ON(!list_empty(&phy->port_siblings));
 
 	transport_remove_device(dev);
 	device_del(dev);
@@ -463,6 +481,258 @@ int scsi_is_sas_phy(const struct device *dev)
 }
 EXPORT_SYMBOL(scsi_is_sas_phy);
 
+/*
+ * SAS Port attributes
+ */
+#define sas_port_show_simple(field, name, format_string, cast)		\
+static ssize_t								\
+show_sas_port_##name(struct class_device *cdev, char *buf)		\
+{									\
+	struct sas_port *port = transport_class_to_sas_port(cdev);	\
+									\
+	return snprintf(buf, 20, format_string, cast port->field);	\
+}
+
+#define sas_port_simple_attr(field, name, format_string, type)		\
+	sas_port_show_simple(field, name, format_string, (type))	\
+static CLASS_DEVICE_ATTR(name, S_IRUGO, show_sas_port_##name, NULL)
+
+sas_port_simple_attr(num_phys, num_phys, "%d\n", int);
+
+static DECLARE_TRANSPORT_CLASS(sas_port_class,
+			       "sas_port", NULL, NULL, NULL);
+
+static int sas_port_match(struct attribute_container *cont, struct device *dev)
+{
+	struct Scsi_Host *shost;
+	struct sas_internal *i;
+
+	if (!scsi_is_sas_port(dev))
+		return 0;
+	shost = dev_to_shost(dev->parent);
+
+	if (!shost->transportt)
+		return 0;
+	if (shost->transportt->host_attrs.ac.class !=
+			&sas_host_class.class)
+		return 0;
+
+	i = to_sas_internal(shost->transportt);
+	return &i->port_attr_cont.ac == cont;
+}
+
+
+static void sas_port_release(struct device *dev)
+{
+	struct sas_port *port = dev_to_sas_port(dev);
+
+	BUG_ON(!list_empty(&port->phy_list));
+
+	put_device(dev->parent);
+	kfree(port);
+}
+
+static void sas_port_create_link(struct sas_port *port,
+				 struct sas_phy *phy)
+{
+	sysfs_create_link(&port->dev.kobj, &phy->dev.kobj, phy->dev.bus_id);
+	sysfs_create_link(&phy->dev.kobj, &port->dev.kobj, "port");
+}
+
+static void sas_port_delete_link(struct sas_port *port,
+				 struct sas_phy *phy)
+{
+	sysfs_remove_link(&port->dev.kobj, phy->dev.bus_id);
+	sysfs_remove_link(&phy->dev.kobj, "port");
+}
+
+/** sas_port_alloc - allocate and initialize a SAS port structure
+ *
+ * @parent:	parent device
+ * @port_id:	port number
+ *
+ * Allocates a SAS port structure.  It will be added to the device tree
+ * below the device specified by @parent which must be either a Scsi_Host
+ * or a sas_expander_device.
+ *
+ * Returns %NULL on error
+ */
+struct sas_port *sas_port_alloc(struct device *parent, int port_id)
+{
+	struct Scsi_Host *shost = dev_to_shost(parent);
+	struct sas_port *port;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return NULL;
+
+	port->port_identifier = port_id;
+
+	device_initialize(&port->dev);
+
+	port->dev.parent = get_device(parent);
+	port->dev.release = sas_port_release;
+
+	mutex_init(&port->phy_list_mutex);
+	INIT_LIST_HEAD(&port->phy_list);
+
+	if (scsi_is_sas_expander_device(parent)) {
+		struct sas_rphy *rphy = dev_to_rphy(parent);
+		sprintf(port->dev.bus_id, "port-%d:%d:%d", shost->host_no,
+			rphy->scsi_target_id, port->port_identifier);
+	} else
+		sprintf(port->dev.bus_id, "port-%d:%d", shost->host_no,
+			port->port_identifier);
+
+	transport_setup_device(&port->dev);
+
+	return port;
+}
+EXPORT_SYMBOL(sas_port_alloc);
+
+/**
+ * sas_port_add - add a SAS port to the device hierarchy
+ *
+ * @port:	port to be added
+ *
+ * publishes a port to the rest of the system
+ */
+int sas_port_add(struct sas_port *port)
+{
+	int error;
+
+	/* No phys should be added until this is made visible */
+	BUG_ON(!list_empty(&port->phy_list));
+
+	error = device_add(&port->dev);
+
+	if (error)
+		return error;
+
+	transport_add_device(&port->dev);
+	transport_configure_device(&port->dev);
+
+	return 0;
+}
+EXPORT_SYMBOL(sas_port_add);
+
+/**
+ * sas_port_free  --  free a SAS PORT
+ * @port:	SAS PORT to free
+ *
+ * Frees the specified SAS PORT.
+ *
+ * Note:
+ *   This function must only be called on a PORT that has not
+ *   successfully been added using sas_port_add().
+ */
+void sas_port_free(struct sas_port *port)
+{
+	transport_destroy_device(&port->dev);
+	put_device(&port->dev);
+}
+EXPORT_SYMBOL(sas_port_free);
+
+/**
+ * sas_port_delete  --  remove SAS PORT
+ * @port:	SAS PORT to remove
+ *
+ * Removes the specified SAS PORT.  If the SAS PORT has an
+ * associated phys, unlink them from the port as well.
+ */
+void sas_port_delete(struct sas_port *port)
+{
+	struct device *dev = &port->dev;
+	struct sas_phy *phy, *tmp_phy;
+
+	if (port->rphy) {
+		sas_rphy_delete(port->rphy);
+		port->rphy = NULL;
+	}
+
+	mutex_lock(&port->phy_list_mutex);
+	list_for_each_entry_safe(phy, tmp_phy, &port->phy_list,
+				 port_siblings) {
+		sas_port_delete_link(port, phy);
+		list_del_init(&phy->port_siblings);
+	}
+	mutex_unlock(&port->phy_list_mutex);
+
+	transport_remove_device(dev);
+	device_del(dev);
+	transport_destroy_device(dev);
+	put_device(dev);
+}
+EXPORT_SYMBOL(sas_port_delete);
+
+/**
+ * scsi_is_sas_port --  check if a struct device represents a SAS port
+ * @dev:	device to check
+ *
+ * Returns:
+ *	%1 if the device represents a SAS Port, %0 else
+ */
+int scsi_is_sas_port(const struct device *dev)
+{
+	return dev->release == sas_port_release;
+}
+EXPORT_SYMBOL(scsi_is_sas_port);
+
+/**
+ * sas_port_add_phy - add another phy to a port to form a wide port
+ * @port:	port to add the phy to
+ * @phy:	phy to add
+ *
+ * When a port is initially created, it is empty (has no phys).  All
+ * ports must have at least one phy to operated, and all wide ports
+ * must have at least two.  The current code makes no difference
+ * between ports and wide ports, but the only object that can be
+ * connected to a remote device is a port, so ports must be formed on
+ * all devices with phys if they're connected to anything.
+ */
+void sas_port_add_phy(struct sas_port *port, struct sas_phy *phy)
+{
+	mutex_lock(&port->phy_list_mutex);
+	if (unlikely(!list_empty(&phy->port_siblings))) {
+		/* make sure we're already on this port */
+		struct sas_phy *tmp;
+
+		list_for_each_entry(tmp, &port->phy_list, port_siblings)
+			if (tmp == phy)
+				break;
+		/* If this trips, you added a phy that was already
+		 * part of a different port */
+		if (unlikely(tmp != phy)) {
+			dev_printk(KERN_ERR, &port->dev, "trying to add phy %s fails: it's already part of another port\n", phy->dev.bus_id);
+			BUG();
+		}
+	} else {
+		sas_port_create_link(port, phy);
+		list_add_tail(&phy->port_siblings, &port->phy_list);
+		port->num_phys++;
+	}
+	mutex_unlock(&port->phy_list_mutex);
+}
+EXPORT_SYMBOL(sas_port_add_phy);
+
+/**
+ * sas_port_delete_phy - remove a phy from a port or wide port
+ * @port:	port to remove the phy from
+ * @phy:	phy to remove
+ *
+ * This operation is used for tearing down ports again.  It must be
+ * done to every port or wide port before calling sas_port_delete.
+ */
+void sas_port_delete_phy(struct sas_port *port, struct sas_phy *phy)
+{
+	mutex_lock(&port->phy_list_mutex);
+	sas_port_delete_link(port, phy);
+	list_del_init(&phy->port_siblings);
+	port->num_phys--;
+	mutex_unlock(&port->phy_list_mutex);
+}
+EXPORT_SYMBOL(sas_port_delete_phy);
+
 /*
  * SAS remote PHY attributes.
  */
@@ -767,7 +1037,7 @@ static void sas_rphy_initialize(struct sas_rphy *rphy)
 * Returns:
 *	SAS PHY allocated or %NULL if the allocation failed.
 */
-struct sas_rphy *sas_end_device_alloc(struct sas_phy *parent)
+struct sas_rphy *sas_end_device_alloc(struct sas_port *parent)
 {
 	struct Scsi_Host *shost = dev_to_shost(&parent->dev);
 	struct sas_end_device *rdev;
@@ -780,8 +1050,13 @@ struct sas_rphy *sas_end_device_alloc(struct sas_phy *parent)
 	device_initialize(&rdev->rphy.dev);
 	rdev->rphy.dev.parent = get_device(&parent->dev);
 	rdev->rphy.dev.release = sas_end_device_release;
-	sprintf(rdev->rphy.dev.bus_id, "end_device-%d:%d-%d",
-		shost->host_no, parent->port_identifier, parent->number);
+	if (scsi_is_sas_expander_device(parent->dev.parent)) {
+		struct sas_rphy *rphy = dev_to_rphy(parent->dev.parent);
+		sprintf(rdev->rphy.dev.bus_id, "end_device-%d:%d:%d",
+			shost->host_no, rphy->scsi_target_id, parent->port_identifier);
+	} else
+		sprintf(rdev->rphy.dev.bus_id, "end_device-%d:%d",
+			shost->host_no, parent->port_identifier);
 	rdev->rphy.identify.device_type = SAS_END_DEVICE;
 	sas_rphy_initialize(&rdev->rphy);
 	transport_setup_device(&rdev->rphy.dev);
@@ -798,7 +1073,7 @@ EXPORT_SYMBOL(sas_end_device_alloc);
 * Returns:
 *	SAS PHY allocated or %NULL if the allocation failed.
 */
-struct sas_rphy *sas_expander_alloc(struct sas_phy *parent,
+struct sas_rphy *sas_expander_alloc(struct sas_port *parent,
				    enum sas_device_type type)
 {
 	struct Scsi_Host *shost = dev_to_shost(&parent->dev);
@@ -837,7 +1112,7 @@ EXPORT_SYMBOL(sas_expander_alloc);
 */
 int sas_rphy_add(struct sas_rphy *rphy)
 {
-	struct sas_phy *parent = dev_to_phy(rphy->dev.parent);
+	struct sas_port *parent = dev_to_sas_port(rphy->dev.parent);
 	struct Scsi_Host *shost = dev_to_shost(parent->dev.parent);
 	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);
 	struct sas_identify *identify = &rphy->identify;
@@ -910,7 +1185,7 @@ void
 sas_rphy_delete(struct sas_rphy *rphy)
 {
 	struct device *dev = &rphy->dev;
-	struct sas_phy *parent = dev_to_phy(dev->parent);
+	struct sas_port *parent = dev_to_sas_port(dev->parent);
 	struct Scsi_Host *shost = dev_to_shost(parent->dev.parent);
 	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);
 
@@ -920,7 +1195,7 @@ sas_rphy_delete(struct sas_rphy *rphy)
 		break;
 	case SAS_EDGE_EXPANDER_DEVICE:
 	case SAS_FANOUT_EXPANDER_DEVICE:
-		device_for_each_child(dev, NULL, do_sas_phy_delete);
+		sas_remove_children(dev);
 		break;
 	default:
 		break;
@@ -967,7 +1242,7 @@ static int sas_user_scan(struct Scsi_Host *shost, uint channel,
 
 	mutex_lock(&sas_host->lock);
 	list_for_each_entry(rphy, &sas_host->rphy_list, list) {
-		struct sas_phy *parent = dev_to_phy(rphy->dev.parent);
+		struct sas_port *parent = dev_to_sas_port(rphy->dev.parent);
 
 		if (rphy->identify.device_type != SAS_END_DEVICE ||
		    rphy->scsi_target_id == -1)
@@ -1003,16 +1278,19 @@ static int sas_user_scan(struct Scsi_Host *shost, uint channel,
 #define SETUP_OPTIONAL_RPORT_ATTRIBUTE(field, func)			\
	SETUP_TEMPLATE(rphy_attrs, field, S_IRUGO, i->f->func)
 
-#define SETUP_PORT_ATTRIBUTE(field)					\
+#define SETUP_PHY_ATTRIBUTE(field)					\
	SETUP_TEMPLATE(phy_attrs, field, S_IRUGO, 1)
 
-#define SETUP_OPTIONAL_PORT_ATTRIBUTE(field, func)			\
+#define SETUP_PORT_ATTRIBUTE(field)					\
+	SETUP_TEMPLATE(port_attrs, field, S_IRUGO, 1)
+
+#define SETUP_OPTIONAL_PHY_ATTRIBUTE(field, func)			\
	SETUP_TEMPLATE(phy_attrs, field, S_IRUGO, i->f->func)
 
-#define SETUP_PORT_ATTRIBUTE_WRONLY(field)				\
+#define SETUP_PHY_ATTRIBUTE_WRONLY(field)				\
	SETUP_TEMPLATE(phy_attrs, field, S_IWUGO, 1)
 
-#define SETUP_OPTIONAL_PORT_ATTRIBUTE_WRONLY(field, func)		\
+#define SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(field, func)		\
	SETUP_TEMPLATE(phy_attrs, field, S_IWUGO, i->f->func)
 
 #define SETUP_END_DEV_ATTRIBUTE(field)					\
@@ -1048,6 +1326,11 @@ sas_attach_transport(struct sas_function_template *ft)
 	i->phy_attr_cont.ac.match = sas_phy_match;
 	transport_container_register(&i->phy_attr_cont);
 
+	i->port_attr_cont.ac.class = &sas_port_class.class;
+	i->port_attr_cont.ac.attrs = &i->port_attrs[0];
+	i->port_attr_cont.ac.match = sas_port_match;
+	transport_container_register(&i->port_attr_cont);
+
 	i->rphy_attr_cont.ac.class = &sas_rphy_class.class;
 	i->rphy_attr_cont.ac.attrs = &i->rphy_attrs[0];
 	i->rphy_attr_cont.ac.match = sas_rphy_match;
@@ -1066,29 +1349,34 @@ sas_attach_transport(struct sas_function_template *ft)
 	i->f = ft;
 
 	count = 0;
+	SETUP_PORT_ATTRIBUTE(num_phys);
 	i->host_attrs[count] = NULL;
 
 	count = 0;
-	SETUP_PORT_ATTRIBUTE(initiator_port_protocols);
-	SETUP_PORT_ATTRIBUTE(target_port_protocols);
-	SETUP_PORT_ATTRIBUTE(device_type);
-	SETUP_PORT_ATTRIBUTE(sas_address);
-	SETUP_PORT_ATTRIBUTE(phy_identifier);
-	SETUP_PORT_ATTRIBUTE(port_identifier);
-	SETUP_PORT_ATTRIBUTE(negotiated_linkrate);
-	SETUP_PORT_ATTRIBUTE(minimum_linkrate_hw);
-	SETUP_PORT_ATTRIBUTE(minimum_linkrate);
-	SETUP_PORT_ATTRIBUTE(maximum_linkrate_hw);
-	SETUP_PORT_ATTRIBUTE(maximum_linkrate);
-
-	SETUP_PORT_ATTRIBUTE(invalid_dword_count);
-	SETUP_PORT_ATTRIBUTE(running_disparity_error_count);
-	SETUP_PORT_ATTRIBUTE(loss_of_dword_sync_count);
-	SETUP_PORT_ATTRIBUTE(phy_reset_problem_count);
-	SETUP_OPTIONAL_PORT_ATTRIBUTE_WRONLY(link_reset, phy_reset);
-	SETUP_OPTIONAL_PORT_ATTRIBUTE_WRONLY(hard_reset, phy_reset);
+	SETUP_PHY_ATTRIBUTE(initiator_port_protocols);
+	SETUP_PHY_ATTRIBUTE(target_port_protocols);
+	SETUP_PHY_ATTRIBUTE(device_type);
+	SETUP_PHY_ATTRIBUTE(sas_address);
+	SETUP_PHY_ATTRIBUTE(phy_identifier);
+	//SETUP_PHY_ATTRIBUTE(port_identifier);
+	SETUP_PHY_ATTRIBUTE(negotiated_linkrate);
+	SETUP_PHY_ATTRIBUTE(minimum_linkrate_hw);
+	SETUP_PHY_ATTRIBUTE(minimum_linkrate);
+	SETUP_PHY_ATTRIBUTE(maximum_linkrate_hw);
+	SETUP_PHY_ATTRIBUTE(maximum_linkrate);
+
+	SETUP_PHY_ATTRIBUTE(invalid_dword_count);
+	SETUP_PHY_ATTRIBUTE(running_disparity_error_count);
+	SETUP_PHY_ATTRIBUTE(loss_of_dword_sync_count);
+	SETUP_PHY_ATTRIBUTE(phy_reset_problem_count);
+	SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(link_reset, phy_reset);
+	SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(hard_reset, phy_reset);
 	i->phy_attrs[count] = NULL;
 
+	count = 0;
+	SETUP_PORT_ATTRIBUTE(num_phys);
+	i->port_attrs[count] = NULL;
+
 	count = 0;
 	SETUP_RPORT_ATTRIBUTE(rphy_initiator_port_protocols);
 	SETUP_RPORT_ATTRIBUTE(rphy_target_port_protocols);
@@ -1131,6 +1419,7 @@ void sas_release_transport(struct scsi_transport_template *t)
 
 	transport_container_unregister(&i->t.host_attrs);
 	transport_container_unregister(&i->phy_attr_cont);
+	transport_container_unregister(&i->port_attr_cont);
 	transport_container_unregister(&i->rphy_attr_cont);
 	transport_container_unregister(&i->end_dev_attr_cont);
 	transport_container_unregister(&i->expander_attr_cont);
@@ -1149,9 +1438,12 @@ static __init int sas_transport_init(void)
 	error = transport_class_register(&sas_phy_class);
 	if (error)
 		goto out_unregister_transport;
-	error = transport_class_register(&sas_rphy_class);
+	error = transport_class_register(&sas_port_class);
 	if (error)
 		goto out_unregister_phy;
+	error = transport_class_register(&sas_rphy_class);
+	if (error)
+		goto out_unregister_port;
 	error = transport_class_register(&sas_end_dev_class);
 	if (error)
 		goto out_unregister_rphy;
@@ -1165,6 +1457,8 @@ static __init int sas_transport_init(void)
 	transport_class_unregister(&sas_end_dev_class);
out_unregister_rphy: transport_class_unregister(&sas_rphy_class); + out_unregister_port: + transport_class_unregister(&sas_port_class); out_unregister_phy: transport_class_unregister(&sas_phy_class); out_unregister_transport: @@ -1178,6 +1472,7 @@ static void __exit sas_transport_exit(void) { transport_class_unregister(&sas_host_class); transport_class_unregister(&sas_phy_class); + transport_class_unregister(&sas_port_class); transport_class_unregister(&sas_rphy_class); transport_class_unregister(&sas_end_dev_class); transport_class_unregister(&sas_expander_class); diff --git a/include/scsi/scsi_transport_sas.h b/include/scsi/scsi_transport_sas.h index 93cfb4bf4211..e3c503cd175e 100644 --- a/include/scsi/scsi_transport_sas.h +++ b/include/scsi/scsi_transport_sas.h @@ -3,6 +3,7 @@ #include #include +#include struct scsi_transport_template; struct sas_rphy; @@ -55,7 +56,6 @@ struct sas_phy { enum sas_linkrate minimum_linkrate; enum sas_linkrate maximum_linkrate_hw; enum sas_linkrate maximum_linkrate; - u8 port_identifier; /* internal state */ unsigned int local_attached : 1; @@ -66,8 +66,8 @@ struct sas_phy { u32 loss_of_dword_sync_count; u32 phy_reset_problem_count; - /* the other end of the link */ - struct sas_rphy *rphy; + /* for the list of phys belonging to a port */ + struct list_head port_siblings; }; #define dev_to_phy(d) \ @@ -124,6 +124,24 @@ struct sas_expander_device { #define rphy_to_expander_device(r) \ container_of((r), struct sas_expander_device, rphy) +struct sas_port { + struct device dev; + + u8 port_identifier; + int num_phys; + + /* the other end of the link */ + struct sas_rphy *rphy; + + struct mutex phy_list_mutex; + struct list_head phy_list; +}; + +#define dev_to_sas_port(d) \ + container_of((d), struct sas_port, dev) +#define transport_class_to_sas_port(cdev) \ + dev_to_sas_port((cdev)->dev) + /* The functions by which the transport class and the driver communicate */ struct sas_function_template { int (*get_linkerrors)(struct sas_phy *); @@ -133,6 +151,7 @@ struct sas_function_template { }; +void sas_remove_children(struct device *); extern void sas_remove_host(struct Scsi_Host *); extern struct sas_phy *sas_phy_alloc(struct device *, int); @@ -141,13 +160,21 @@ extern int sas_phy_add(struct sas_phy *); extern void sas_phy_delete(struct sas_phy *); extern int scsi_is_sas_phy(const struct device *); -extern struct sas_rphy *sas_end_device_alloc(struct sas_phy *); -extern struct sas_rphy *sas_expander_alloc(struct sas_phy *, enum sas_device_type); +extern struct sas_rphy *sas_end_device_alloc(struct sas_port *); +extern struct sas_rphy *sas_expander_alloc(struct sas_port *, enum sas_device_type); void sas_rphy_free(struct sas_rphy *); extern int sas_rphy_add(struct sas_rphy *); extern void sas_rphy_delete(struct sas_rphy *); extern int scsi_is_sas_rphy(const struct device *); +struct sas_port *sas_port_alloc(struct device *, int); +int sas_port_add(struct sas_port *); +void sas_port_free(struct sas_port *); +void sas_port_delete(struct sas_port *); +void sas_port_add_phy(struct sas_port *, struct sas_phy *); +void sas_port_delete_phy(struct sas_port *, struct sas_phy *); +int scsi_is_sas_port(const struct device *); + extern struct scsi_transport_template * sas_attach_transport(struct sas_function_template *); extern void sas_release_transport(struct scsi_transport_template *); -- cgit v1.2.3 From 844d3b427ef1a4f96e54866747bdb6c6cbca4c6a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 Jun 2006 21:48:27 -0700 Subject: MTD: fix all kernel-doc warnings Fix 
all kernel-doc warnings in MTD headers and source files: - add some missing struct fields; - correct some function parameter names; - use kernel-doc format for function doc. headers; - nand_ecc.c contains only exported interfaces, no internal ones; Signed-off-by: Randy Dunlap Signed-off-by: David Woodhouse --- Documentation/DocBook/mtdnand.tmpl | 4 +++- drivers/mtd/nand/nand_base.c | 16 ++++++++-------- drivers/mtd/nand/nand_ecc.c | 3 +-- include/linux/mtd/nand.h | 13 ++++++++++--- 4 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 6e463d0db266..32f385605981 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -1295,7 +1295,9 @@ in this page !Idrivers/mtd/nand/nand_base.c !Idrivers/mtd/nand/nand_bbt.c -!Idrivers/mtd/nand/nand_ecc.c + diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 80a76654d963..62b861304e03 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -155,7 +155,7 @@ static u16 nand_read_word(struct mtd_info *mtd) /** * nand_select_chip - [DEFAULT] control CE line * @mtd: MTD device structure - * @chip: chipnumber to select, -1 for deselect + * @chipnr: chipnumber to select, -1 for deselect * * Default select function for 1 chip devices. */ @@ -542,7 +542,6 @@ static void nand_command(struct mtd_info *mtd, unsigned int command, * Send command to NAND device. This is the version for the new large page * devices We dont have the separate regions as we have in the small page * devices. We must emulate NAND_CMD_READOOB to keep the code compatible. - * */ static void nand_command_lp(struct mtd_info *mtd, unsigned int command, int column, int page_addr) @@ -656,7 +655,7 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command, /** * nand_get_device - [GENERIC] Get chip for selected access - * @this: the nand chip descriptor + * @chip: the nand chip descriptor * @mtd: MTD device structure * @new_state: the state which is requested * @@ -696,13 +695,12 @@ nand_get_device(struct nand_chip *chip, struct mtd_info *mtd, int new_state) /** * nand_wait - [DEFAULT] wait until the command is done * @mtd: MTD device structure - * @this: NAND chip structure + * @chip: NAND chip structure * * Wait for command done. This applies to erase and program only * Erase can take up to 400ms and program up to 20ms according to * general NAND and SmartMedia specs - * -*/ + */ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip) { @@ -896,6 +894,7 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip, /** * nand_transfer_oob - [Internal] Transfer oob to client buffer * @chip: nand chip structure + * @oob: oob destination address * @ops: oob ops structure */ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob, @@ -946,6 +945,7 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob, * * @mtd: MTD device structure * @from: offset to read from + * @ops: oob ops structure * * Internal function. Called with chip held. 
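[Editor's aside: since this commit is entirely about kernel-doc hygiene, it may help to show the target shape. A hypothetical, correctly formed block looks like this; the function is invented for illustration.]

        /**
         * ex_read_byte - read one byte from the chip
         * @mtd: MTD device structure
         *
         * The opening marker, the 'name - summary' line and one '@param:'
         * line per argument are what scripts/kernel-doc expects; missing or
         * misnamed @param lines are what produced the warnings fixed here.
         */
        static uint8_t ex_read_byte(struct mtd_info *mtd)
        {
                struct nand_chip *chip = mtd->priv;

                return readb(chip->IO_ADDR_R);
        }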
*/ @@ -1760,7 +1760,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, /** * nand_write_oob - [MTD Interface] NAND write data and/or out-of-band * @mtd: MTD device structure - * @from: offset to read from + * @to: offset to write to * @ops: oob operation description structure */ static int nand_write_oob(struct mtd_info *mtd, loff_t to, @@ -2055,7 +2055,7 @@ static void nand_sync(struct mtd_info *mtd) /** * nand_block_isbad - [MTD Interface] Check if block at offset is bad * @mtd: MTD device structure - * @ofs: offset relative to mtd start + * @offs: offset relative to mtd start */ static int nand_block_isbad(struct mtd_info *mtd, loff_t offs) { diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index 2a163e4084df..dd438ca47d9a 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c @@ -65,8 +65,7 @@ static const u_char nand_ecc_precalc_table[] = { }; /** - * nand_calculate_ecc - [NAND Interface] Calculate 3 byte ECC code - * for 256 byte block + * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block * @mtd: MTD block structure * @dat: raw data * @ecc_code: buffer for ECC diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 66559272ebcb..2266f032a8c6 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -202,7 +202,7 @@ typedef enum { struct nand_chip; /** - * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independend devices + * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices * @lock: protection lock * @active: the mtd device which holds the controller currently * @wq: wait queue to sleep on if a NAND operation is in progress @@ -223,12 +223,15 @@ struct nand_hw_control { * @total: total number of ecc bytes per page * @prepad: padding information for syndrome based ecc generators * @postpad: padding information for syndrome based ecc generators + * @layout: ECC layout control struct pointer * @hwctl: function to control hardware ecc generator. 
Must only * be provided if an hardware ECC is available * @calculate: function for ecc calculation or readback from ecc hardware * @correct: function for ecc correction, matching to ecc generator (sw/hw) * @read_page: function to read a page according to the ecc generator requirements * @write_page: function to write a page according to the ecc generator requirements + * @read_oob: function to read chip OOB data + * @write_oob: function to write chip OOB data */ struct nand_ecc_ctrl { nand_ecc_modes_t mode; @@ -300,11 +303,15 @@ struct nand_buffers { * @cmdfunc: [REPLACEABLE] hardwarespecific function for writing commands to the chip * @waitfunc: [REPLACEABLE] hardwarespecific function for wait on ready * @ecc: [BOARDSPECIFIC] ecc control ctructure + * @buffers: buffer structure for read/write + * @hwcontrol: platform-specific hardware control structure + * @ops: oob operation operands * @erase_cmd: [INTERN] erase command write function, selectable due to AND support * @scan_bbt: [REPLACEABLE] function to scan bad block table * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR) * @wq: [INTERN] wait queue to sleep on if a NAND operation is in progress * @state: [INTERN] the current state of the NAND device + * @oob_poi: poison value buffer * @page_shift: [INTERN] number of address bits in a page (column address bits) * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry @@ -521,7 +528,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, * struct platform_nand_chip - chip level device structure * * @nr_chips: max. number of chips to scan for - * @chip_offs: chip number offset + * @chip_offset: chip number offset * @nr_partitions: number of partitions pointed to by partitions (or zero) * @partitions: mtd partition list * @chip_delay: R/B delay value in us @@ -546,7 +553,7 @@ struct platform_nand_chip { * @hwcontrol: platform specific hardware control structure * @dev_ready: platform specific function to read ready/busy pin * @select_chip: platform specific chip select function - * @priv_data: private data to transport driver specific settings + * @priv: private data to transport driver specific settings * * All fields are optional and depend on the hardware driver requirements */ -- cgit v1.2.3 From ea9b6dcc152f09c207117ab121d4fa03d2db282a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 Jun 2006 21:48:38 -0700 Subject: MTD: kernel-doc fixes + additions Fix some kernel-doc typos/spellos. Use kernel-doc syntax in places where it was almost used. Correct/add struct, struct field, and function param names where needed. Signed-off-by: Randy Dunlap Signed-off-by: David Woodhouse --- Documentation/DocBook/mtdnand.tmpl | 7 ++-- include/linux/mtd/bbm.h | 35 +++++++++-------- include/linux/mtd/mtd.h | 4 +- include/linux/mtd/nand.h | 3 -- include/linux/mtd/onenand.h | 77 ++++++++++++++++++++++---------------- include/mtd/mtd-abi.h | 2 +- 6 files changed, 69 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 32f385605981..630159c5bdba 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -109,7 +109,7 @@ for most of the implementations. These functions can be replaced by the board driver if neccecary. Those functions are called via pointers in the NAND chip description structure. 
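[Editor's aside: to make the replaceable-member scheme concrete, a hedged sketch of a minimal board driver of this era follows; BOARD_NAND_PHYS and the ex_ names are invented.]

        #include <linux/init.h>
        #include <linux/module.h>
        #include <linux/mtd/mtd.h>
        #include <linux/mtd/nand.h>
        #include <asm/io.h>

        #define BOARD_NAND_PHYS 0x40000000      /* hypothetical chip window */

        static struct mtd_info ex_mtd;
        static struct nand_chip ex_chip;

        static int __init ex_nand_init(void)
        {
                void __iomem *io = ioremap(BOARD_NAND_PHYS, 0x1000);

                if (!io)
                        return -EIO;

                ex_mtd.priv = &ex_chip;

                /* [BOARDSPECIFIC] members the board driver must provide */
                ex_chip.IO_ADDR_R = io;
                ex_chip.IO_ADDR_W = io;
                ex_chip.chip_delay = 20;        /* tR in microseconds */
                ex_chip.ecc.mode = NAND_ECC_SOFT;

                /*
                 * [REPLACEABLE] members (cmdfunc, waitfunc, ...) are left
                 * NULL here, so nand_scan() installs the defaults for the
                 * detected chip type, exactly as described above.
                 */
                return nand_scan(&ex_mtd, 1);
        }
        module_init(ex_nand_init);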
The board driver can set the functions which - should be replaced by board dependend functions before calling nand_scan(). + should be replaced by board dependent functions before calling nand_scan(). If the function pointer is NULL on entry to nand_scan() then the pointer is set to the default function which is suitable for the detected chip type. @@ -133,7 +133,7 @@ [REPLACEABLE] Replaceable members hold hardware related functions which can be provided by the board driver. The board driver can set the functions which - should be replaced by board dependend functions before calling nand_scan(). + should be replaced by board dependent functions before calling nand_scan(). If the function pointer is NULL on entry to nand_scan() then the pointer is set to the default function which is suitable for the detected chip type. @@ -156,9 +156,8 @@ Basic board driver For most boards it will be sufficient to provide just the - basic functions and fill out some really board dependend + basic functions and fill out some really board dependent members in the nand chip description structure. - See drivers/mtd/nand/skeleton for reference. Basic defines diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h index 7a7fbe87fef0..1221b7c44158 100644 --- a/include/linux/mtd/bbm.h +++ b/include/linux/mtd/bbm.h @@ -19,21 +19,21 @@ /** * struct nand_bbt_descr - bad block table descriptor - * @param options options for this descriptor - * @param pages the page(s) where we find the bbt, used with + * @options: options for this descriptor + * @pages: the page(s) where we find the bbt, used with * option BBT_ABSPAGE when bbt is searched, * then we store the found bbts pages here. * Its an array and supports up to 8 chips now - * @param offs offset of the pattern in the oob area of the page - * @param veroffs offset of the bbt version counter in the oob are of the page - * @param version version read from the bbt page during scan - * @param len length of the pattern, if 0 no pattern check is performed - * @param maxblocks maximum number of blocks to search for a bbt. This number of - * blocks is reserved at the end of the device + * @offs: offset of the pattern in the oob area of the page + * @veroffs: offset of the bbt version counter in the oob area of the page + * @version: version read from the bbt page during scan + * @len: length of the pattern, if 0 no pattern check is performed + * @maxblocks: maximum number of blocks to search for a bbt. This + * number of blocks is reserved at the end of the device * where the tables are written. 
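[Editor's aside: for concreteness, here is a descriptor instance modeled on the kernel's own bad block table defaults; reproduced as an illustrative sketch, with an ex_ prefix to mark it as such.]

        static uint8_t ex_bbt_pattern[] = { 'B', 'b', 't', '0' };

        static struct nand_bbt_descr ex_main_descr = {
                .options        = NAND_BBT_LASTBLOCK | NAND_BBT_CREATE
                                | NAND_BBT_WRITE | NAND_BBT_2BIT
                                | NAND_BBT_VERSION | NAND_BBT_PERCHIP,
                .offs           = 8,    /* pattern starts at OOB byte 8 */
                .len            = 4,    /* all four pattern bytes are checked */
                .veroffs        = 12,   /* version counter follows the pattern */
                .maxblocks      = 4,    /* search the last four blocks */
                .pattern        = ex_bbt_pattern,
        };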
- * @param reserved_block_code if non-0, this pattern denotes a reserved + * @reserved_block_code: if non-0, this pattern denotes a reserved * (rather than bad) block in the stored bbt - * @param pattern pattern to identify bad block table or factory marked + * @pattern: pattern to identify bad block table or factory marked * good / bad blocks, can be NULL, if len = 0 * * Descriptor for the bad block table marker and the descriptor for the @@ -93,12 +93,15 @@ struct nand_bbt_descr { #define ONENAND_BADBLOCK_POS 0 /** - * struct bbt_info - [GENERIC] Bad Block Table data structure - * @param bbt_erase_shift [INTERN] number of address bits in a bbt entry - * @param badblockpos [INTERN] position of the bad block marker in the oob area - * @param bbt [INTERN] bad block table pointer - * @param badblock_pattern [REPLACEABLE] bad block scan pattern used for initial bad block scan - * @param priv [OPTIONAL] pointer to private bbm date + * struct bbm_info - [GENERIC] Bad Block Table data structure + * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry + * @badblockpos: [INTERN] position of the bad block marker in the oob area + * @options: options for this descriptor + * @bbt: [INTERN] bad block table pointer + * @isbad_bbt: function to determine if a block is bad + * @badblock_pattern: [REPLACEABLE] bad block scan pattern used for + * initial bad block scan + * @priv: [OPTIONAL] pointer to private bbm date */ struct bbm_info { int bbt_erase_shift; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 9b7a2b525d63..94a443d45258 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -77,11 +77,11 @@ typedef enum { * * @len: number of bytes to write/read. When a data buffer is given * (datbuf != NULL) this is the number of data bytes. When - + no data buffer is available this is the number of oob bytes. + * no data buffer is available this is the number of oob bytes. * * @retlen: number of bytes written/read. When a data buffer is given * (datbuf != NULL) this is the number of data bytes. When - + no data buffer is available this is the number of oob bytes. + * no data buffer is available this is the number of oob bytes. * * @ooblen: number of oob bytes per page * @ooboffs: offset of oob data in the oob area (only relevant when diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 2266f032a8c6..0b4cd2fa64aa 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -407,7 +407,6 @@ struct nand_chip { /** * struct nand_flash_dev - NAND Flash Device ID Structure - * * @name: Identify the device type * @id: device ID code * @pagesize: Pagesize in bytes. Either 256 or 512 or 0 @@ -526,7 +525,6 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, /** * struct platform_nand_chip - chip level device structure - * * @nr_chips: max. 
number of chips to scan for * @chip_offset: chip number offset * @nr_partitions: number of partitions pointed to by partitions (or zero) @@ -549,7 +547,6 @@ struct platform_nand_chip { /** * struct platform_nand_ctrl - controller level device structure - * * @hwcontrol: platform specific hardware control structure * @dev_ready: platform specific function to read ready/busy pin * @select_chip: platform specific chip select function diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 9ce9a48db444..1f4972155249 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -23,7 +23,7 @@ extern int onenand_scan(struct mtd_info *mtd, int max_chips); /* Free resources held by the OneNAND device */ extern void onenand_release(struct mtd_info *mtd); -/** +/* * onenand_state_t - chip states * Enumeration for OneNAND flash chip state */ @@ -42,9 +42,9 @@ typedef enum { /** * struct onenand_bufferram - OneNAND BufferRAM Data - * @param block block address in BufferRAM - * @param page page address in BufferRAM - * @param valid valid flag + * @block: block address in BufferRAM + * @page: page address in BufferRAM + * @valid: valid flag */ struct onenand_bufferram { int block; @@ -54,32 +54,43 @@ struct onenand_bufferram { /** * struct onenand_chip - OneNAND Private Flash Chip Data - * @param base [BOARDSPECIFIC] address to access OneNAND - * @param chipsize [INTERN] the size of one chip for multichip arrays - * @param device_id [INTERN] device ID - * @param verstion_id [INTERN] version ID - * @param options [BOARDSPECIFIC] various chip options. They can partly be set to inform onenand_scan about - * @param erase_shift [INTERN] number of address bits in a block - * @param page_shift [INTERN] number of address bits in a page - * @param ppb_shift [INTERN] number of address bits in a pages per block - * @param page_mask [INTERN] a page per block mask - * @param bufferam_index [INTERN] BufferRAM index - * @param bufferam [INTERN] BufferRAM info - * @param readw [REPLACEABLE] hardware specific function for read short - * @param writew [REPLACEABLE] hardware specific function for write short - * @param command [REPLACEABLE] hardware specific function for writing commands to the chip - * @param wait [REPLACEABLE] hardware specific function for wait on ready - * @param read_bufferram [REPLACEABLE] hardware specific function for BufferRAM Area - * @param write_bufferram [REPLACEABLE] hardware specific function for BufferRAM Area - * @param read_word [REPLACEABLE] hardware specific function for read register of OneNAND - * @param write_word [REPLACEABLE] hardware specific function for write register of OneNAND - * @param scan_bbt [REPLACEALBE] hardware specific function for scaning Bad block Table - * @param chip_lock [INTERN] spinlock used to protect access to this structure and the chip - * @param wq [INTERN] wait queue to sleep on if a OneNAND operation is in progress - * @param state [INTERN] the current state of the OneNAND device - * @param ecclayout [REPLACEABLE] the default ecc placement scheme - * @param bbm [REPLACEABLE] pointer to Bad Block Management - * @param priv [OPTIONAL] pointer to private chip date + * @base: [BOARDSPECIFIC] address to access OneNAND + * @chipsize: [INTERN] the size of one chip for multichip arrays + * @device_id: [INTERN] device ID + * @density_mask: chip density, used for DDP devices + * @verstion_id: [INTERN] version ID + * @options: [BOARDSPECIFIC] various chip options. 
They can + * partly be set to inform onenand_scan about + * @erase_shift: [INTERN] number of address bits in a block + * @page_shift: [INTERN] number of address bits in a page + * @ppb_shift: [INTERN] number of address bits in a pages per block + * @page_mask: [INTERN] a page per block mask + * @bufferram_index: [INTERN] BufferRAM index + * @bufferram: [INTERN] BufferRAM info + * @readw: [REPLACEABLE] hardware specific function for read short + * @writew: [REPLACEABLE] hardware specific function for write short + * @command: [REPLACEABLE] hardware specific function for writing + * commands to the chip + * @wait: [REPLACEABLE] hardware specific function for wait on ready + * @read_bufferram: [REPLACEABLE] hardware specific function for BufferRAM Area + * @write_bufferram: [REPLACEABLE] hardware specific function for BufferRAM Area + * @read_word: [REPLACEABLE] hardware specific function for read + * register of OneNAND + * @write_word: [REPLACEABLE] hardware specific function for write + * register of OneNAND + * @mmcontrol: sync burst read function + * @block_markbad: function to mark a block as bad + * @scan_bbt: [REPLACEABLE] hardware specific function for scanning + * Bad block Table + * @chip_lock: [INTERN] spinlock used to protect access to this + * structure and the chip + * @wq: [INTERN] wait queue to sleep on if a OneNAND + * operation is in progress + * @state: [INTERN] the current state of the OneNAND device + * @page_buf: data buffer + * @ecclayout: [REPLACEABLE] the default ecc placement scheme + * @bbm: [REPLACEABLE] pointer to Bad Block Management + * @priv: [OPTIONAL] pointer to private chip data */ struct onenand_chip { void __iomem *base; @@ -147,9 +158,9 @@ struct onenand_chip { #define ONENAND_MFR_SAMSUNG 0xec /** - * struct nand_manufacturers - NAND Flash Manufacturer ID Structure - * @param name: Manufacturer name - * @param id: manufacturer ID code of device. + * struct onenand_manufacturers - NAND Flash Manufacturer ID Structure + * @name: Manufacturer name + * @id: manufacturer ID code of device. */ struct onenand_manufacturers { int id; diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index 31329fce1ff5..1da3f7fa7993 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -133,7 +133,7 @@ struct nand_ecclayout { }; /** - * struct mtd_ecc_stats - error correction status + * struct mtd_ecc_stats - error correction stats * * @corrected: number of corrected bits * @failed: number of uncorrectable errors -- cgit v1.2.3 From 01cb225dad8da2e717356fab03240e2f4a8d01bf Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 28 Jun 2006 12:00:22 -0500 Subject: [SCSI] iscsi: add target discovery event to transport class Patch from david.somayajulu@qlogic.com: Add target discovery event. We may have a setup where the iscsi traffic is on a different network than the other network traffic. In this case we will want to do discovery through the iscsi card. This patch adds an event to the transport class that can be used by hw iscsi cards that support this.
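[Editor's aside: to illustrate the hook's contract, a sketch follows; ex_hw_send_targets() is an invented stand-in for programming the HBA's firmware, not taken from any real driver.]

        static int ex_hw_send_targets(uint32_t host_no, struct sockaddr *addr)
        {
                /* invented: would kick off SendTargets=All on the HBA */
                return 0;
        }

        static int ex_tgt_dscvr(enum iscsi_tgt_dscvr type, uint32_t host_no,
                                uint32_t enable, struct sockaddr *dst_addr)
        {
                switch (type) {
                case ISCSI_TGT_DSCVR_SEND_TARGETS:
                        if (!enable)
                                return 0;       /* nothing to tear down here */
                        return ex_hw_send_targets(host_no, dst_addr);
                case ISCSI_TGT_DSCVR_ISNS:
                case ISCSI_TGT_DSCVR_SLP:
                default:
                        return -ENOSYS;
                }
        }

        /* wired up as .tgt_dscvr in the driver's struct iscsi_transport */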
Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/scsi_transport_iscsi.c | 18 ++++++++++++++++++ include/scsi/iscsi_if.h | 37 +++++++++++++++++++++++++++++++++++++ include/scsi/scsi_transport_iscsi.h | 2 ++ 3 files changed, 57 insertions(+) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 5569fdcfd621..99e76d458290 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -977,6 +977,21 @@ iscsi_if_transport_ep(struct iscsi_transport *transport, return rc; } +static int +iscsi_tgt_dscvr(struct iscsi_transport *transport, + struct iscsi_uevent *ev) +{ + struct sockaddr *dst_addr; + + if (!transport->tgt_dscvr) + return -EINVAL; + + dst_addr = (struct sockaddr *)((char*)ev + sizeof(*ev)); + return transport->tgt_dscvr(ev->u.tgt_dscvr.type, + ev->u.tgt_dscvr.host_no, + ev->u.tgt_dscvr.enable, dst_addr); +} + static int iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { @@ -1065,6 +1080,9 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT: err = iscsi_if_transport_ep(transport, ev, nlh->nlmsg_type); break; + case ISCSI_UEVENT_TGT_DSCVR: + err = iscsi_tgt_dscvr(transport, ev); + break; default: err = -EINVAL; break; diff --git a/include/scsi/iscsi_if.h b/include/scsi/iscsi_if.h index 253797c60095..8813f0f4c624 100644 --- a/include/scsi/iscsi_if.h +++ b/include/scsi/iscsi_if.h @@ -47,12 +47,20 @@ enum iscsi_uevent_e { ISCSI_UEVENT_TRANSPORT_EP_POLL = UEVENT_BASE + 13, ISCSI_UEVENT_TRANSPORT_EP_DISCONNECT = UEVENT_BASE + 14, + ISCSI_UEVENT_TGT_DSCVR = UEVENT_BASE + 15, + /* up events */ ISCSI_KEVENT_RECV_PDU = KEVENT_BASE + 1, ISCSI_KEVENT_CONN_ERROR = KEVENT_BASE + 2, ISCSI_KEVENT_IF_ERROR = KEVENT_BASE + 3, }; +enum iscsi_tgt_dscvr { + ISCSI_TGT_DSCVR_SEND_TARGETS = 1, + ISCSI_TGT_DSCVR_ISNS = 2, + ISCSI_TGT_DSCVR_SLP = 3, +}; + struct iscsi_uevent { uint32_t type; /* k/u events type */ uint32_t iferror; /* carries interface or resource errors */ @@ -116,6 +124,17 @@ struct iscsi_uevent { struct msg_transport_disconnect { uint64_t ep_handle; } ep_disconnect; + struct msg_tgt_dscvr { + enum iscsi_tgt_dscvr type; + uint32_t host_no; + /* + * enable = 1 to establish a new connection + * with the server. enable = 0 to disconnect + * from the server. Used primarily to switch + * from one iSNS server to another. 
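[Editor's aside: on the userspace side, the payload layout implied by the (char *)ev + sizeof(*ev) arithmetic above would be built roughly like this; a hedged sketch with the netlink send plumbing omitted.]

        struct sockaddr_in dest;        /* filled in with the portal address */
        char buf[sizeof(struct iscsi_uevent) + sizeof(dest)];
        struct iscsi_uevent *ev = (struct iscsi_uevent *)buf;

        memset(buf, 0, sizeof(buf));
        ev->type = ISCSI_UEVENT_TGT_DSCVR;
        ev->u.tgt_dscvr.type = ISCSI_TGT_DSCVR_SEND_TARGETS;
        ev->u.tgt_dscvr.host_no = 0;            /* HBA to discover through */
        ev->u.tgt_dscvr.enable = 1;             /* connect to the server */
        memcpy(buf + sizeof(*ev), &dest, sizeof(dest));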
+ */ + uint32_t enable; + } tgt_dscvr; } u; union { /* messages k -> u */ @@ -141,6 +160,24 @@ struct iscsi_uevent { struct msg_transport_connect_ret { uint64_t handle; } ep_connect_ret; + struct msg_tgt_dscvr_ret { + /* + * session/connection pair used to reference + * the connection to server + */ + uint32_t sid; + uint32_t cid; + union { + struct isns { + /* port # for conn to iSNS server */ + uint16_t isns_port; + /* listening port to receive SCNs */ + uint16_t scn_port; + /* listening port to receive ESIs */ + uint16_t esi_port; + } isns_attrib; + } u; + } tgt_dscvr_ret; } r; } __attribute__ ((aligned (sizeof(uint64_t)))); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index b684426a5900..b95151aec602 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -127,6 +127,8 @@ struct iscsi_transport { uint64_t *ep_handle); int (*ep_poll) (uint64_t ep_handle, int timeout_ms); void (*ep_disconnect) (uint64_t ep_handle); + int (*tgt_dscvr) (enum iscsi_tgt_dscvr type, uint32_t host_no, + uint32_t enable, struct sockaddr *dst_addr); }; /* -- cgit v1.2.3 From a54a52caad4bd6166cb7fa64e4e93031fa2fda5d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 28 Jun 2006 12:00:23 -0500 Subject: [SCSI] iscsi: fixup set/get param functions Reduce duplication in the software iscsi_transport modules by adding a libiscsi function to handle the common grunt work. This also has the drivers return specific -EXXX values for different errors so userspace can finally handle them in a sane way. Also just pass the sysfs buffers to the drivers so HW iscsi can get/set its string values, like targetname and initiatorname. Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/libiscsi.c | 179 ++++++++++++++++++++++++++++++ drivers/scsi/scsi_transport_iscsi.c | 210 ++++++++---------------------------- include/scsi/libiscsi.h | 15 ++- include/scsi/scsi_transport_iscsi.h | 29 ++--- 4 files changed, 246 insertions(+), 187 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 2673a11a9495..7c76a989b218 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1697,6 +1697,185 @@ int iscsi_conn_bind(struct iscsi_cls_session *cls_session, } EXPORT_SYMBOL_GPL(iscsi_conn_bind); + +int iscsi_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_session *session = conn->session; + uint32_t value; + + switch(param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + sscanf(buf, "%d", &conn->max_recv_dlength); + break; + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + sscanf(buf, "%d", &conn->max_xmit_dlength); + break; + case ISCSI_PARAM_HDRDGST_EN: + sscanf(buf, "%d", &conn->hdrdgst_en); + break; + case ISCSI_PARAM_DATADGST_EN: + sscanf(buf, "%d", &conn->datadgst_en); + break; + case ISCSI_PARAM_INITIAL_R2T_EN: + sscanf(buf, "%d", &session->initial_r2t_en); + break; + case ISCSI_PARAM_MAX_R2T: + sscanf(buf, "%d", &session->max_r2t); + break; + case ISCSI_PARAM_IMM_DATA_EN: + sscanf(buf, "%d", &session->imm_data_en); + break; + case ISCSI_PARAM_FIRST_BURST: + sscanf(buf, "%d", &session->first_burst); + break; + case ISCSI_PARAM_MAX_BURST: + sscanf(buf, "%d", &session->max_burst); + break; + case ISCSI_PARAM_PDU_INORDER_EN: + sscanf(buf, "%d", &session->pdu_inorder_en); + break; + case ISCSI_PARAM_DATASEQ_INORDER_EN: + sscanf(buf, "%d", &session->dataseq_inorder_en); + break; + case
ISCSI_PARAM_ERL: + sscanf(buf, "%d", &session->erl); + break; + case ISCSI_PARAM_IFMARKER_EN: + sscanf(buf, "%d", &value); + BUG_ON(value); + break; + case ISCSI_PARAM_OFMARKER_EN: + sscanf(buf, "%d", &value); + BUG_ON(value); + break; + case ISCSI_PARAM_EXP_STATSN: + sscanf(buf, "%u", &conn->exp_statsn); + break; + case ISCSI_PARAM_TARGET_NAME: + /* this should not change between logins */ + if (session->targetname) + break; + + session->targetname = kstrdup(buf, GFP_KERNEL); + if (!session->targetname) + return -ENOMEM; + break; + case ISCSI_PARAM_TPGT: + sscanf(buf, "%d", &session->tpgt); + break; + case ISCSI_PARAM_PERSISTENT_PORT: + sscanf(buf, "%d", &conn->persistent_port); + break; + case ISCSI_PARAM_PERSISTENT_ADDRESS: + /* + * this is the address returned in discovery so it should + * not change between logins. + */ + if (conn->persistent_address) + break; + + conn->persistent_address = kstrdup(buf, GFP_KERNEL); + if (!conn->persistent_address) + return -ENOMEM; + break; + default: + return -ENOSYS; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iscsi_set_param); + +int iscsi_session_get_param(struct iscsi_cls_session *cls_session, + enum iscsi_param param, char *buf) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + struct iscsi_session *session = iscsi_hostdata(shost->hostdata); + int len; + + switch(param) { + case ISCSI_PARAM_INITIAL_R2T_EN: + len = sprintf(buf, "%d\n", session->initial_r2t_en); + break; + case ISCSI_PARAM_MAX_R2T: + len = sprintf(buf, "%hu\n", session->max_r2t); + break; + case ISCSI_PARAM_IMM_DATA_EN: + len = sprintf(buf, "%d\n", session->imm_data_en); + break; + case ISCSI_PARAM_FIRST_BURST: + len = sprintf(buf, "%u\n", session->first_burst); + break; + case ISCSI_PARAM_MAX_BURST: + len = sprintf(buf, "%u\n", session->max_burst); + break; + case ISCSI_PARAM_PDU_INORDER_EN: + len = sprintf(buf, "%d\n", session->pdu_inorder_en); + break; + case ISCSI_PARAM_DATASEQ_INORDER_EN: + len = sprintf(buf, "%d\n", session->dataseq_inorder_en); + break; + case ISCSI_PARAM_ERL: + len = sprintf(buf, "%d\n", session->erl); + break; + case ISCSI_PARAM_TARGET_NAME: + len = sprintf(buf, "%s\n", session->targetname); + break; + case ISCSI_PARAM_TPGT: + len = sprintf(buf, "%d\n", session->tpgt); + break; + default: + return -ENOSYS; + } + + return len; +} +EXPORT_SYMBOL_GPL(iscsi_session_get_param); + +int iscsi_conn_get_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + int len; + + switch(param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + len = sprintf(buf, "%u\n", conn->max_recv_dlength); + break; + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + len = sprintf(buf, "%u\n", conn->max_xmit_dlength); + break; + case ISCSI_PARAM_HDRDGST_EN: + len = sprintf(buf, "%d\n", conn->hdrdgst_en); + break; + case ISCSI_PARAM_DATADGST_EN: + len = sprintf(buf, "%d\n", conn->datadgst_en); + break; + case ISCSI_PARAM_IFMARKER_EN: + len = sprintf(buf, "%d\n", conn->ifmarker_en); + break; + case ISCSI_PARAM_OFMARKER_EN: + len = sprintf(buf, "%d\n", conn->ofmarker_en); + break; + case ISCSI_PARAM_EXP_STATSN: + len = sprintf(buf, "%u\n", conn->exp_statsn); + break; + case ISCSI_PARAM_PERSISTENT_PORT: + len = sprintf(buf, "%d\n", conn->persistent_port); + break; + case ISCSI_PARAM_PERSISTENT_ADDRESS: + len = sprintf(buf, "%s\n", conn->persistent_address); + break; + default: + return -ENOSYS; + } + + return len; +} +EXPORT_SYMBOL_GPL(iscsi_conn_get_param); + MODULE_AUTHOR("Mike Christie"); MODULE_DESCRIPTION("iSCSI library 
functions"); MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 99e76d458290..147c854e1d4d 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -233,7 +233,6 @@ static void iscsi_session_release(struct device *dev) shost = iscsi_session_to_shost(session); scsi_host_put(shost); - kfree(session->targetname); kfree(session); module_put(transport->owner); } @@ -388,7 +387,6 @@ static void iscsi_conn_release(struct device *dev) struct iscsi_cls_conn *conn = iscsi_dev_to_conn(dev); struct device *parent = conn->dev.parent; - kfree(conn->persistent_address); kfree(conn); put_device(parent); } @@ -877,23 +875,13 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev return 0; } -static void -iscsi_copy_param(struct iscsi_uevent *ev, uint32_t *value, char *data) -{ - if (ev->u.set_param.len != sizeof(uint32_t)) - BUG(); - memcpy(value, data, min_t(uint32_t, sizeof(uint32_t), - ev->u.set_param.len)); -} - static int iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev) { char *data = (char*)ev + sizeof(*ev); struct iscsi_cls_conn *conn; struct iscsi_cls_session *session; - int err = 0; - uint32_t value = 0; + int err = 0, value = 0; session = iscsi_session_lookup(ev->u.set_param.sid); conn = iscsi_conn_lookup(ev->u.set_param.sid, ev->u.set_param.cid); @@ -902,42 +890,13 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev) switch (ev->u.set_param.param) { case ISCSI_PARAM_SESS_RECOVERY_TMO: - iscsi_copy_param(ev, &value, data); + sscanf(data, "%d", &value); if (value != 0) session->recovery_tmo = value; break; - case ISCSI_PARAM_TARGET_NAME: - /* this should not change between logins */ - if (session->targetname) - return 0; - - session->targetname = kstrdup(data, GFP_KERNEL); - if (!session->targetname) - return -ENOMEM; - break; - case ISCSI_PARAM_TPGT: - iscsi_copy_param(ev, &value, data); - session->tpgt = value; - break; - case ISCSI_PARAM_PERSISTENT_PORT: - iscsi_copy_param(ev, &value, data); - conn->persistent_port = value; - break; - case ISCSI_PARAM_PERSISTENT_ADDRESS: - /* - * this is the address returned in discovery so it should - * not change between logins. 
- */ - if (conn->persistent_address) - return 0; - - conn->persistent_address = kstrdup(data, GFP_KERNEL); - if (!conn->persistent_address) - return -ENOMEM; - break; default: - iscsi_copy_param(ev, &value, data); - err = transport->set_param(conn, ev->u.set_param.param, value); + err = transport->set_param(conn, ev->u.set_param.param, + data, ev->u.set_param.len); } return err; @@ -1165,49 +1124,31 @@ struct class_device_attribute class_device_attr_##_prefix##_##_name = \ /* * iSCSI connection attrs */ -#define iscsi_conn_int_attr_show(param, format) \ +#define iscsi_conn_attr_show(param) \ static ssize_t \ -show_conn_int_param_##param(struct class_device *cdev, char *buf) \ +show_conn_param_##param(struct class_device *cdev, char *buf) \ { \ - uint32_t value = 0; \ struct iscsi_cls_conn *conn = iscsi_cdev_to_conn(cdev); \ struct iscsi_transport *t = conn->transport; \ - \ - t->get_conn_param(conn, param, &value); \ - return snprintf(buf, 20, format"\n", value); \ + return t->get_conn_param(conn, param, buf); \ } -#define iscsi_conn_int_attr(field, param, format) \ - iscsi_conn_int_attr_show(param, format) \ -static ISCSI_CLASS_ATTR(conn, field, S_IRUGO, show_conn_int_param_##param, \ +#define iscsi_conn_attr(field, param) \ + iscsi_conn_attr_show(param) \ +static ISCSI_CLASS_ATTR(conn, field, S_IRUGO, show_conn_param_##param, \ NULL); -iscsi_conn_int_attr(max_recv_dlength, ISCSI_PARAM_MAX_RECV_DLENGTH, "%u"); -iscsi_conn_int_attr(max_xmit_dlength, ISCSI_PARAM_MAX_XMIT_DLENGTH, "%u"); -iscsi_conn_int_attr(header_digest, ISCSI_PARAM_HDRDGST_EN, "%d"); -iscsi_conn_int_attr(data_digest, ISCSI_PARAM_DATADGST_EN, "%d"); -iscsi_conn_int_attr(ifmarker, ISCSI_PARAM_IFMARKER_EN, "%d"); -iscsi_conn_int_attr(ofmarker, ISCSI_PARAM_OFMARKER_EN, "%d"); -iscsi_conn_int_attr(persistent_port, ISCSI_PARAM_PERSISTENT_PORT, "%d"); -iscsi_conn_int_attr(port, ISCSI_PARAM_CONN_PORT, "%d"); -iscsi_conn_int_attr(exp_statsn, ISCSI_PARAM_EXP_STATSN, "%u"); - -#define iscsi_conn_str_attr_show(param) \ -static ssize_t \ -show_conn_str_param_##param(struct class_device *cdev, char *buf) \ -{ \ - struct iscsi_cls_conn *conn = iscsi_cdev_to_conn(cdev); \ - struct iscsi_transport *t = conn->transport; \ - return t->get_conn_str_param(conn, param, buf); \ -} - -#define iscsi_conn_str_attr(field, param) \ - iscsi_conn_str_attr_show(param) \ -static ISCSI_CLASS_ATTR(conn, field, S_IRUGO, show_conn_str_param_##param, \ - NULL); - -iscsi_conn_str_attr(persistent_address, ISCSI_PARAM_PERSISTENT_ADDRESS); -iscsi_conn_str_attr(address, ISCSI_PARAM_CONN_ADDRESS); +iscsi_conn_attr(max_recv_dlength, ISCSI_PARAM_MAX_RECV_DLENGTH); +iscsi_conn_attr(max_xmit_dlength, ISCSI_PARAM_MAX_XMIT_DLENGTH); +iscsi_conn_attr(header_digest, ISCSI_PARAM_HDRDGST_EN); +iscsi_conn_attr(data_digest, ISCSI_PARAM_DATADGST_EN); +iscsi_conn_attr(ifmarker, ISCSI_PARAM_IFMARKER_EN); +iscsi_conn_attr(ofmarker, ISCSI_PARAM_OFMARKER_EN); +iscsi_conn_attr(persistent_port, ISCSI_PARAM_PERSISTENT_PORT); +iscsi_conn_attr(port, ISCSI_PARAM_CONN_PORT); +iscsi_conn_attr(exp_statsn, ISCSI_PARAM_EXP_STATSN); +iscsi_conn_attr(persistent_address, ISCSI_PARAM_PERSISTENT_ADDRESS); +iscsi_conn_attr(address, ISCSI_PARAM_CONN_ADDRESS); #define iscsi_cdev_to_session(_cdev) \ iscsi_dev_to_session(_cdev->dev) @@ -1215,61 +1156,36 @@ iscsi_conn_str_attr(address, ISCSI_PARAM_CONN_ADDRESS); /* * iSCSI session attrs */ -#define iscsi_session_int_attr_show(param, format) \ -static ssize_t \ -show_session_int_param_##param(struct class_device *cdev, char *buf) \ -{ \ - uint32_t value 
= 0; \ - struct iscsi_cls_session *session = iscsi_cdev_to_session(cdev); \ - struct iscsi_transport *t = session->transport; \ - \ - t->get_session_param(session, param, &value); \ - return snprintf(buf, 20, format"\n", value); \ -} - -#define iscsi_session_int_attr(field, param, format) \ - iscsi_session_int_attr_show(param, format) \ -static ISCSI_CLASS_ATTR(sess, field, S_IRUGO, show_session_int_param_##param, \ - NULL); - -iscsi_session_int_attr(initial_r2t, ISCSI_PARAM_INITIAL_R2T_EN, "%d"); -iscsi_session_int_attr(max_outstanding_r2t, ISCSI_PARAM_MAX_R2T, "%hu"); -iscsi_session_int_attr(immediate_data, ISCSI_PARAM_IMM_DATA_EN, "%d"); -iscsi_session_int_attr(first_burst_len, ISCSI_PARAM_FIRST_BURST, "%u"); -iscsi_session_int_attr(max_burst_len, ISCSI_PARAM_MAX_BURST, "%u"); -iscsi_session_int_attr(data_pdu_in_order, ISCSI_PARAM_PDU_INORDER_EN, "%d"); -iscsi_session_int_attr(data_seq_in_order, ISCSI_PARAM_DATASEQ_INORDER_EN, "%d"); -iscsi_session_int_attr(erl, ISCSI_PARAM_ERL, "%d"); -iscsi_session_int_attr(tpgt, ISCSI_PARAM_TPGT, "%d"); - -#define iscsi_session_str_attr_show(param) \ +#define iscsi_session_attr_show(param) \ static ssize_t \ -show_session_str_param_##param(struct class_device *cdev, char *buf) \ +show_session_param_##param(struct class_device *cdev, char *buf) \ { \ struct iscsi_cls_session *session = iscsi_cdev_to_session(cdev); \ struct iscsi_transport *t = session->transport; \ - return t->get_session_str_param(session, param, buf); \ + return t->get_session_param(session, param, buf); \ } -#define iscsi_session_str_attr(field, param) \ - iscsi_session_str_attr_show(param) \ -static ISCSI_CLASS_ATTR(sess, field, S_IRUGO, show_session_str_param_##param, \ +#define iscsi_session_attr(field, param) \ + iscsi_session_attr_show(param) \ +static ISCSI_CLASS_ATTR(sess, field, S_IRUGO, show_session_param_##param, \ NULL); -iscsi_session_str_attr(targetname, ISCSI_PARAM_TARGET_NAME); +iscsi_session_attr(targetname, ISCSI_PARAM_TARGET_NAME); +iscsi_session_attr(initial_r2t, ISCSI_PARAM_INITIAL_R2T_EN); +iscsi_session_attr(max_outstanding_r2t, ISCSI_PARAM_MAX_R2T); +iscsi_session_attr(immediate_data, ISCSI_PARAM_IMM_DATA_EN); +iscsi_session_attr(first_burst_len, ISCSI_PARAM_FIRST_BURST); +iscsi_session_attr(max_burst_len, ISCSI_PARAM_MAX_BURST); +iscsi_session_attr(data_pdu_in_order, ISCSI_PARAM_PDU_INORDER_EN); +iscsi_session_attr(data_seq_in_order, ISCSI_PARAM_DATASEQ_INORDER_EN); +iscsi_session_attr(erl, ISCSI_PARAM_ERL); +iscsi_session_attr(tpgt, ISCSI_PARAM_TPGT); -/* - * Private session and conn attrs. userspace uses several iscsi values - * to identify each session between reboots. 
Some of these values may not - * be present in the iscsi_transport/LLD driver becuase userspace handles - * login (and failback for login redirect) so for these type of drivers - * the class manages the attrs and values for the iscsi_transport/LLD - */ #define iscsi_priv_session_attr_show(field, format) \ static ssize_t \ -show_priv_session_##field(struct class_device *cdev, char *buf) \ +show_priv_session_##field(struct class_device *cdev, char *buf) \ { \ - struct iscsi_cls_session *session = iscsi_cdev_to_session(cdev); \ + struct iscsi_cls_session *session = iscsi_cdev_to_session(cdev);\ return sprintf(buf, format"\n", session->field); \ } @@ -1277,31 +1193,15 @@ show_priv_session_##field(struct class_device *cdev, char *buf) \ iscsi_priv_session_attr_show(field, format) \ static ISCSI_CLASS_ATTR(priv_sess, field, S_IRUGO, show_priv_session_##field, \ NULL) -iscsi_priv_session_attr(targetname, "%s"); -iscsi_priv_session_attr(tpgt, "%d"); iscsi_priv_session_attr(recovery_tmo, "%d"); -#define iscsi_priv_conn_attr_show(field, format) \ -static ssize_t \ -show_priv_conn_##field(struct class_device *cdev, char *buf) \ -{ \ - struct iscsi_cls_conn *conn = iscsi_cdev_to_conn(cdev); \ - return sprintf(buf, format"\n", conn->field); \ -} - -#define iscsi_priv_conn_attr(field, format) \ - iscsi_priv_conn_attr_show(field, format) \ -static ISCSI_CLASS_ATTR(priv_conn, field, S_IRUGO, show_priv_conn_##field, \ - NULL) -iscsi_priv_conn_attr(persistent_address, "%s"); -iscsi_priv_conn_attr(persistent_port, "%d"); - #define SETUP_PRIV_SESSION_RD_ATTR(field) \ do { \ priv->session_attrs[count] = &class_device_attr_priv_sess_##field; \ count++; \ } while (0) + #define SETUP_SESSION_RD_ATTR(field, param_flag) \ do { \ if (tt->param_mask & param_flag) { \ @@ -1310,12 +1210,6 @@ do { \ } \ } while (0) -#define SETUP_PRIV_CONN_RD_ATTR(field) \ -do { \ - priv->conn_attrs[count] = &class_device_attr_priv_conn_##field; \ - count++; \ -} while (0) - #define SETUP_CONN_RD_ATTR(field, param_flag) \ do { \ if (tt->param_mask & param_flag) { \ @@ -1442,16 +1336,8 @@ iscsi_register_transport(struct iscsi_transport *tt) SETUP_CONN_RD_ATTR(address, ISCSI_CONN_ADDRESS); SETUP_CONN_RD_ATTR(port, ISCSI_CONN_PORT); SETUP_CONN_RD_ATTR(exp_statsn, ISCSI_EXP_STATSN); - - if (tt->param_mask & ISCSI_PERSISTENT_ADDRESS) - SETUP_CONN_RD_ATTR(persistent_address, ISCSI_PERSISTENT_ADDRESS); - else - SETUP_PRIV_CONN_RD_ATTR(persistent_address); - - if (tt->param_mask & ISCSI_PERSISTENT_PORT) - SETUP_CONN_RD_ATTR(persistent_port, ISCSI_PERSISTENT_PORT); - else - SETUP_PRIV_CONN_RD_ATTR(persistent_port); + SETUP_CONN_RD_ATTR(persistent_address, ISCSI_PERSISTENT_ADDRESS); + SETUP_CONN_RD_ATTR(persistent_port, ISCSI_PERSISTENT_PORT); BUG_ON(count > ISCSI_CONN_ATTRS); priv->conn_attrs[count] = NULL; @@ -1471,18 +1357,10 @@ iscsi_register_transport(struct iscsi_transport *tt) SETUP_SESSION_RD_ATTR(data_pdu_in_order, ISCSI_PDU_INORDER_EN); SETUP_SESSION_RD_ATTR(data_seq_in_order, ISCSI_DATASEQ_INORDER_EN); SETUP_SESSION_RD_ATTR(erl, ISCSI_ERL); + SETUP_SESSION_RD_ATTR(targetname, ISCSI_TARGET_NAME); + SETUP_SESSION_RD_ATTR(tpgt, ISCSI_TPGT); SETUP_PRIV_SESSION_RD_ATTR(recovery_tmo); - if (tt->param_mask & ISCSI_TARGET_NAME) - SETUP_SESSION_RD_ATTR(targetname, ISCSI_TARGET_NAME); - else - SETUP_PRIV_SESSION_RD_ATTR(targetname); - - if (tt->param_mask & ISCSI_TPGT) - SETUP_SESSION_RD_ATTR(tpgt, ISCSI_TPGT); - else - SETUP_PRIV_SESSION_RD_ATTR(tpgt); - BUG_ON(count > ISCSI_SESSION_ATTRS); priv->session_attrs[count] = NULL; diff --git 
a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index cbf7e58bd6f9..ba2760802ded 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -157,6 +157,11 @@ struct iscsi_conn { int max_xmit_dlength; /* target_max_recv_dsl */ int hdrdgst_en; int datadgst_en; + int ifmarker_en; + int ofmarker_en; + /* values userspace uses to id a conn */ + int persistent_port; + char *persistent_address; /* MIB-statistics */ uint64_t txdata_octets; @@ -196,8 +201,8 @@ struct iscsi_session { int pdu_inorder_en; int dataseq_inorder_en; int erl; - int ifmarker_en; - int ofmarker_en; + int tpgt; + char *targetname; /* control data */ struct iscsi_transport *tt; @@ -240,6 +245,10 @@ iscsi_session_setup(struct iscsi_transport *, struct scsi_transport_template *, extern void iscsi_session_teardown(struct iscsi_cls_session *); extern struct iscsi_session *class_to_transport_session(struct iscsi_cls_session *); extern void iscsi_session_recovery_timedout(struct iscsi_cls_session *); +extern int iscsi_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen); +extern int iscsi_session_get_param(struct iscsi_cls_session *cls_session, + enum iscsi_param param, char *buf); #define session_to_cls(_sess) \ hostdata_session(_sess->host->hostdata) @@ -255,6 +264,8 @@ extern void iscsi_conn_stop(struct iscsi_cls_conn *, int); extern int iscsi_conn_bind(struct iscsi_cls_session *, struct iscsi_cls_conn *, int); extern void iscsi_conn_failure(struct iscsi_conn *conn, enum iscsi_err err); +extern int iscsi_conn_get_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf); /* * pdu and task processing diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index b95151aec602..05397058a9b8 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -34,6 +34,7 @@ struct iscsi_cls_conn; struct iscsi_conn; struct iscsi_cmd_task; struct iscsi_mgmt_task; +struct sockaddr; /** * struct iscsi_transport - iSCSI Transport template @@ -46,7 +47,12 @@ struct iscsi_mgmt_task; * @bind_conn: associate this connection with existing iSCSI session * and specified transport descriptor * @destroy_conn: destroy inactive iSCSI connection - * @set_param: set iSCSI Data-Path operational parameter + * @set_param: set iSCSI parameter. Return 0 on success, -ENODATA + * when param is not supported, and a -Exx value on other + * error. + * @get_param get iSCSI parameter. Must return number of bytes + * copied to buffer on success, -ENODATA when param + * is not supported, and a -Exx value on other error * @start_conn: set connection to be operational * @stop_conn: suspend/recover/terminate connection * @send_pdu: send iSCSI PDU, Login, Logout, NOP-Out, Reject, Text. 
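[Editor's aside: the net effect for a software transport is that the template members can now point straight at the libiscsi helpers. A sketch in the style of iscsi_tcp follows; the name is hypothetical, and drivers with transport-private parameters would wrap the helpers rather than plug them in directly.]

        static struct iscsi_transport ex_sw_transport = {
                .owner              = THIS_MODULE,
                .name               = "ex_tcp",         /* hypothetical */
                .set_param          = iscsi_set_param,
                .get_conn_param     = iscsi_conn_get_param,
                .get_session_param  = iscsi_session_get_param,
                /* create_session, bind_conn, send_pdu, ... elided */
        };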
@@ -97,15 +103,11 @@ struct iscsi_transport { void (*stop_conn) (struct iscsi_cls_conn *conn, int flag); void (*destroy_conn) (struct iscsi_cls_conn *conn); int (*set_param) (struct iscsi_cls_conn *conn, enum iscsi_param param, - uint32_t value); + char *buf, int buflen); int (*get_conn_param) (struct iscsi_cls_conn *conn, - enum iscsi_param param, uint32_t *value); + enum iscsi_param param, char *buf); int (*get_session_param) (struct iscsi_cls_session *session, - enum iscsi_param param, uint32_t *value); - int (*get_conn_str_param) (struct iscsi_cls_conn *conn, - enum iscsi_param param, char *buf); - int (*get_session_str_param) (struct iscsi_cls_session *session, - enum iscsi_param param, char *buf); + enum iscsi_param param, char *buf); int (*send_pdu) (struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr, char *data, uint32_t data_size); void (*get_stats) (struct iscsi_cls_conn *conn, @@ -157,13 +159,6 @@ struct iscsi_cls_conn { struct iscsi_transport *transport; uint32_t cid; /* connection id */ - /* portal/group values we got during discovery */ - char *persistent_address; - int persistent_port; - /* portal/group values we are currently using */ - char *address; - int port; - int active; /* must be accessed with the connlock */ struct device dev; /* sysfs transport/container device */ struct mempool_zone *z_error; @@ -187,10 +182,6 @@ struct iscsi_cls_session { struct list_head host_list; struct iscsi_transport *transport; - /* iSCSI values used as unique id by userspace. */ - char *targetname; - int tpgt; - /* recovery fields */ int recovery_tmo; struct work_struct recovery_work; -- cgit v1.2.3 From e6f3b63f50b4bb9fdc9025e0c3994acd265ad3a2 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 28 Jun 2006 12:00:29 -0500 Subject: [SCSI] iscsi: rm channel usage from iscsi I do not remember what I was thinking when we added the channel as an argument to the session create function. It was probably due to too much cut and paste work from the FC transport class. The channel is meaningless for iscsi drivers so this patch drops its usage everywhere in the iscsi-related code.
Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/libiscsi.c | 2 +- drivers/scsi/scsi_transport_iscsi.c | 8 +++----- include/scsi/scsi_transport_iscsi.h | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 7e6e031cc41b..499e79f0cac5 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1290,7 +1290,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, if (!try_module_get(iscsit->owner)) goto cls_session_fail; - cls_session = iscsi_create_session(shost, iscsit, 0); + cls_session = iscsi_create_session(shost, iscsit); if (!cls_session) goto module_put; *(unsigned long*)shost->hostdata = (unsigned long)cls_session; diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 8717ff51ba4b..c0ec502835ee 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -248,10 +248,9 @@ static int iscsi_user_scan(struct Scsi_Host *shost, uint channel, mutex_lock(&ihost->mutex); list_for_each_entry(session, &ihost->sessions, host_list) { - if ((channel == SCAN_WILD_CARD || - channel == session->channel) && + if ((channel == SCAN_WILD_CARD || channel == 0) && (id == SCAN_WILD_CARD || id == session->target_id)) - scsi_scan_target(&session->dev, session->channel, + scsi_scan_target(&session->dev, 0, session->target_id, lun, 1); } mutex_unlock(&ihost->mutex); @@ -297,7 +296,7 @@ EXPORT_SYMBOL_GPL(iscsi_block_session); **/ struct iscsi_cls_session * iscsi_create_session(struct Scsi_Host *shost, - struct iscsi_transport *transport, int channel) + struct iscsi_transport *transport) { struct iscsi_host *ihost; struct iscsi_cls_session *session; @@ -322,7 +321,6 @@ iscsi_create_session(struct Scsi_Host *shost, ihost = shost->shost_data; session->sid = iscsi_session_nr++; - session->channel = channel; session->target_id = ihost->next_target_id++; snprintf(session->dev.bus_id, BUS_ID_SIZE, "session%u", session->sid); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 05397058a9b8..2e3cb37af047 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -187,7 +187,6 @@ struct iscsi_cls_session { struct work_struct recovery_work; int target_id; - int channel; int sid; /* session id */ void *dd_data; /* LLD private data */ @@ -210,7 +209,7 @@ struct iscsi_host { * session and connection functions that can be used by HW iSCSI LLDs */ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, - struct iscsi_transport *t, int channel); + struct iscsi_transport *t); extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, uint32_t cid); -- cgit v1.2.3 From 8434aa8b6fe5af27a33b8aa830c24e3680356c83 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 28 Jun 2006 12:00:30 -0500 Subject: [SCSI] iscsi: break up session creation into two stages qla4xxx is initialized in two steps like other HW drivers. It allocates the host, sets up the HW, then adds the host. For iSCSI, part of the HW setup is setting up persistent iSCSI sessions. At that time, the interrupts are off and the driver is not completely set up, so we just want to allocate the sessions. We do not want to add them to sysfs and expose them to userspace yet, because userspace could try to do lots of fun things with them, like scanning, while the driver is not ready.
So this patch breaks up session creation, like other functions that use the driver model, into the alloc and add parts. When the driver is ready, it can then add the sessions, and userspace can begin using them. This also fixes a bug in the addition error path, where we forgot to do a get on the session. Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/scsi_transport_iscsi.c | 97 ++++++++++++++++++++++++++----------- include/scsi/scsi_transport_iscsi.h | 8 +++ 2 files changed, 76 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index c0ec502835ee..f39da0cf5f18 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -287,20 +287,11 @@ void iscsi_block_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_block_session); -/** - * iscsi_create_session - create iscsi class session - * @shost: scsi host - * @transport: iscsi transport - * - * This can be called from a LLD or iscsi_transport. - **/ struct iscsi_cls_session * -iscsi_create_session(struct Scsi_Host *shost, - struct iscsi_transport *transport) +iscsi_alloc_session(struct Scsi_Host *shost, + struct iscsi_transport *transport) { - struct iscsi_host *ihost; struct iscsi_cls_session *session; - int err; session = kzalloc(sizeof(*session) + transport->sessiondata_size, GFP_KERNEL); @@ -313,8 +304,20 @@ iscsi_create_session(struct Scsi_Host *shost, INIT_LIST_HEAD(&session->host_list); INIT_LIST_HEAD(&session->sess_list); + session->dev.parent = &shost->shost_gendev; + session->dev.release = iscsi_session_release; + device_initialize(&session->dev); if (transport->sessiondata_size) session->dd_data = &session[1]; + return session; +} +EXPORT_SYMBOL_GPL(iscsi_alloc_session); + +int iscsi_add_session(struct iscsi_cls_session *session) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(session); + struct iscsi_host *ihost; + int err; /* this is released in the dev's release function */ scsi_host_get(shost); @@ -325,37 +328,51 @@ iscsi_create_session(struct Scsi_Host *shost, snprintf(session->dev.bus_id, BUS_ID_SIZE, "session%u", session->sid); - session->dev.parent = &shost->shost_gendev; - session->dev.release = iscsi_session_release; - err = device_register(&session->dev); + err = device_add(&session->dev); if (err) { dev_printk(KERN_ERR, &session->dev, "iscsi: could not " "register session's dev\n"); - goto free_session; + goto release_host; } transport_register_device(&session->dev); mutex_lock(&ihost->mutex); list_add(&session->host_list, &ihost->sessions); mutex_unlock(&ihost->mutex); + return 0; - return session; - -free_session: - kfree(session); - return NULL; +release_host: + scsi_host_put(shost); + return err; } - -EXPORT_SYMBOL_GPL(iscsi_create_session); +EXPORT_SYMBOL_GPL(iscsi_add_session); /** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session + * iscsi_create_session - create iscsi class session + * @shost: scsi host + * @transport: iscsi transport * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. + * This can be called from a LLD or iscsi_transport.
**/ -int iscsi_destroy_session(struct iscsi_cls_session *session) +struct iscsi_cls_session * +iscsi_create_session(struct Scsi_Host *shost, + struct iscsi_transport *transport) +{ + struct iscsi_cls_session *session; + + session = iscsi_alloc_session(shost, transport); + if (!session) + return NULL; + + if (iscsi_add_session(session)) { + iscsi_free_session(session); + return NULL; + } + return session; +} +EXPORT_SYMBOL_GPL(iscsi_create_session); + +void iscsi_remove_session(struct iscsi_cls_session *session) { struct Scsi_Host *shost = iscsi_session_to_shost(session); struct iscsi_host *ihost = shost->shost_data; @@ -367,11 +384,33 @@ int iscsi_destroy_session(struct iscsi_cls_session *session) list_del(&session->host_list); mutex_unlock(&ihost->mutex); + scsi_remove_target(&session->dev); + transport_unregister_device(&session->dev); - device_unregister(&session->dev); - return 0; + device_del(&session->dev); } +EXPORT_SYMBOL_GPL(iscsi_remove_session); +void iscsi_free_session(struct iscsi_cls_session *session) +{ + put_device(&session->dev); +} + +EXPORT_SYMBOL_GPL(iscsi_free_session); + +/** + * iscsi_destroy_session - destroy iscsi session + * @session: iscsi_session + * + * Can be called by a LLD or iscsi_transport. There must not be + * any running connections. + **/ +int iscsi_destroy_session(struct iscsi_cls_session *session) +{ + iscsi_remove_session(session); + iscsi_free_session(session); + return 0; +} EXPORT_SYMBOL_GPL(iscsi_destroy_session); static void iscsi_conn_release(struct device *dev) diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 2e3cb37af047..53493d591355 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -199,6 +199,9 @@ struct iscsi_cls_session { #define iscsi_session_to_shost(_session) \ dev_to_shost(_session->dev.parent) +#define starget_to_session(_stgt) \ + iscsi_dev_to_session(_stgt->dev.parent) + struct iscsi_host { int next_target_id; struct list_head sessions; @@ -208,8 +211,13 @@ struct iscsi_host { /* * session and connection functions that can be used by HW iSCSI LLDs */ +extern struct iscsi_cls_session *iscsi_alloc_session(struct Scsi_Host *shost, + struct iscsi_transport *transport); +extern int iscsi_add_session(struct iscsi_cls_session *session); extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, struct iscsi_transport *t); +extern void iscsi_remove_session(struct iscsi_cls_session *session); +extern void iscsi_free_session(struct iscsi_cls_session *session); extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, uint32_t cid); -- cgit v1.2.3 From 6a8a0d3621745279a131d95f0204dc9ddac60d55 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 28 Jun 2006 12:00:31 -0500 Subject: [SCSI] iscsi: pass target nr to session creation So the drivers do not use the channel numbers, but some do use the target numbers. We were just adding a goofy variable that increases for each target nr. This is useless for software iscsi because it is always zero. And for qla4xxx the target nr is actually the index of the target/session in its FW or FLASH tables. We needed to expose this to userspace so apps could access those numbers, so this patch adds the target nr to the iscsi session creation functions. This way, when qla4xxx's HW thinks a session is at target nr 4 in its HW, it is exposed as that number in sysfs.
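As a sketch of the intended use: software iSCSI keeps passing 0, while a HW LLD passes the index its firmware uses for the session. The names hw_iscsi_transport and fw_ddb_index below are illustrative, not taken from this patch:

	/* software iscsi: the target nr carries no HW meaning, pass 0 */
	cls_session = iscsi_create_session(shost, iscsit, 0);

	/* hypothetical HW LLD: expose the FW/FLASH table index as the
	 * target nr so the sysfs id matches what the HW reports */
	cls_session = iscsi_create_session(shost, &hw_iscsi_transport,
					   fw_ddb_index);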
Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/libiscsi.c | 2 +- drivers/scsi/scsi_transport_iscsi.c | 14 +++++++------- include/scsi/scsi_transport_iscsi.h | 7 ++++--- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 499e79f0cac5..7e6e031cc41b 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1290,7 +1290,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, if (!try_module_get(iscsit->owner)) goto cls_session_fail; - cls_session = iscsi_create_session(shost, iscsit); + cls_session = iscsi_create_session(shost, iscsit, 0); if (!cls_session) goto module_put; *(unsigned long*)shost->hostdata = (unsigned long)cls_session; diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index f39da0cf5f18..7963c0538de5 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -304,6 +304,8 @@ iscsi_alloc_session(struct Scsi_Host *shost, INIT_LIST_HEAD(&session->host_list); INIT_LIST_HEAD(&session->sess_list); + /* this is released in the dev's release function */ + scsi_host_get(shost); session->dev.parent = &shost->shost_gendev; session->dev.release = iscsi_session_release; device_initialize(&session->dev); @@ -313,18 +315,15 @@ iscsi_alloc_session(struct Scsi_Host *shost, } EXPORT_SYMBOL_GPL(iscsi_alloc_session); -int iscsi_add_session(struct iscsi_cls_session *session) +int iscsi_add_session(struct iscsi_cls_session *session, unsigned int target_id) { struct Scsi_Host *shost = iscsi_session_to_shost(session); struct iscsi_host *ihost; int err; - /* this is released in the dev's release function */ - scsi_host_get(shost); ihost = shost->shost_data; - session->sid = iscsi_session_nr++; - session->target_id = ihost->next_target_id++; + session->target_id = target_id; snprintf(session->dev.bus_id, BUS_ID_SIZE, "session%u", session->sid); @@ -356,7 +355,8 @@ EXPORT_SYMBOL_GPL(iscsi_add_session); **/ struct iscsi_cls_session * iscsi_create_session(struct Scsi_Host *shost, - struct iscsi_transport *transport) + struct iscsi_transport *transport, + unsigned int target_id) { struct iscsi_cls_session *session; @@ -364,7 +364,7 @@ iscsi_create_session(struct Scsi_Host *shost, if (!session) return NULL; - if (iscsi_add_session(session)) { + if (iscsi_add_session(session, target_id)) { iscsi_free_session(session); return NULL; } diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 53493d591355..f7b0db5f2f5b 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -203,7 +203,6 @@ struct iscsi_cls_session { iscsi_dev_to_session(_stgt->dev.parent) struct iscsi_host { - int next_target_id; struct list_head sessions; struct mutex mutex; }; @@ -213,9 +212,11 @@ struct iscsi_host { */ extern struct iscsi_cls_session *iscsi_alloc_session(struct Scsi_Host *shost, struct iscsi_transport *transport); -extern int iscsi_add_session(struct iscsi_cls_session *session); +extern int iscsi_add_session(struct iscsi_cls_session *session, + unsigned int target_id); extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, - struct iscsi_transport *t); + struct iscsi_transport *t, + unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); extern int iscsi_destroy_session(struct iscsi_cls_session *session); -- cgit v1.2.3 
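Taken together, the last two patches give a HW LLD a staged bring-up; a minimal sketch of the intended call sequence (hw_iscsi_transport and fw_ddb_index are hypothetical driver-side names, and error handling is trimmed):

	struct iscsi_cls_session *session;
	int err;

	/* HW setup phase: interrupts off, nothing exposed to userspace */
	session = iscsi_alloc_session(shost, &hw_iscsi_transport);
	if (!session)
		return -ENOMEM;

	/* ... finish HW initialization ... */

	/* driver ready: add the session so userspace can see and scan it */
	err = iscsi_add_session(session, fw_ddb_index);
	if (err) {
		iscsi_free_session(session);
		return err;
	}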
From 53cb8a1f45e06a2627a6d89b151cccb95fa45cbf Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 28 Jun 2006 12:00:32 -0500 Subject: [SCSI] iscsi: add async notification of session events This patch adds or modifies the transport class functions used to notify userspace of session state events. We modify the session addition up event and add a destruction event to notify userspace of session creation, relogin and destruction. And we modify the conn error event to be sent by broadcast since multiple listeners may want to listen for it. Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/scsi_transport_iscsi.c | 309 ++++++++++++++++++++++++++---------- include/scsi/iscsi_if.h | 23 +-- include/scsi/scsi_transport_iscsi.h | 3 + 3 files changed, 235 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 7963c0538de5..7b9e8fa1a4e0 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -413,11 +413,59 @@ int iscsi_destroy_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_destroy_session); +static void mempool_zone_destroy(struct mempool_zone *zp) +{ + mempool_destroy(zp->pool); + kfree(zp); +} + +static void* +mempool_zone_alloc_skb(gfp_t gfp_mask, void *pool_data) +{ + struct mempool_zone *zone = pool_data; + + return alloc_skb(zone->size, gfp_mask); +} + +static void +mempool_zone_free_skb(void *element, void *pool_data) +{ + kfree_skb(element); +} + +static struct mempool_zone * +mempool_zone_init(unsigned max, unsigned size, unsigned hiwat) +{ + struct mempool_zone *zp; + + zp = kzalloc(sizeof(*zp), GFP_KERNEL); + if (!zp) + return NULL; + + zp->size = size; + zp->hiwat = hiwat; + INIT_LIST_HEAD(&zp->freequeue); + spin_lock_init(&zp->freelock); + atomic_set(&zp->allocated, 0); + + zp->pool = mempool_create(max, mempool_zone_alloc_skb, + mempool_zone_free_skb, zp); + if (!zp->pool) { + kfree(zp); + return NULL; + } + + return zp; +} + static void iscsi_conn_release(struct device *dev) { struct iscsi_cls_conn *conn = iscsi_dev_to_conn(dev); struct device *parent = conn->dev.parent; + mempool_zone_destroy(conn->z_pdu); + mempool_zone_destroy(conn->z_error); + kfree(conn); put_device(parent); } @@ -427,6 +475,31 @@ static int iscsi_is_conn_dev(const struct device *dev) return dev->release == iscsi_conn_release; } +static int iscsi_create_event_pools(struct iscsi_cls_conn *conn) +{ + conn->z_pdu = mempool_zone_init(Z_MAX_PDU, + NLMSG_SPACE(sizeof(struct iscsi_uevent) + + sizeof(struct iscsi_hdr) + + DEFAULT_MAX_RECV_DATA_SEGMENT_LENGTH), + Z_HIWAT_PDU); + if (!conn->z_pdu) { + dev_printk(KERN_ERR, &conn->dev, "iscsi: can not allocate " + "pdu zone for new conn\n"); + return -ENOMEM; + } + + conn->z_error = mempool_zone_init(Z_MAX_ERROR, + NLMSG_SPACE(sizeof(struct iscsi_uevent)), + Z_HIWAT_ERROR); + if (!conn->z_error) { + dev_printk(KERN_ERR, &conn->dev, "iscsi: can not allocate " + "error zone for new conn\n"); + mempool_zone_destroy(conn->z_pdu); + return -ENOMEM; + } + return 0; +} + /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session @@ -459,9 +532,12 @@ iscsi_create_conn(struct iscsi_cls_session *session, uint32_t cid) conn->transport = transport; conn->cid = cid; + if (iscsi_create_event_pools(conn)) + goto free_conn; + /* this is released in the dev's release function */ if (!get_device(&session->dev)) - goto free_conn; + goto free_conn_pools; snprintf(conn->dev.bus_id, 
BUS_ID_SIZE, "connection%d:%u", session->sid, cid); @@ -478,6 +554,8 @@ iscsi_create_conn(struct iscsi_cls_session *session, uint32_t cid) release_parent_ref: put_device(&session->dev); +free_conn_pools: + free_conn: kfree(conn); return NULL; @@ -525,20 +603,6 @@ static inline struct list_head *skb_to_lh(struct sk_buff *skb) return (struct list_head *)&skb->cb; } -static void* -mempool_zone_alloc_skb(gfp_t gfp_mask, void *pool_data) -{ - struct mempool_zone *zone = pool_data; - - return alloc_skb(zone->size, gfp_mask); -} - -static void -mempool_zone_free_skb(void *element, void *pool_data) -{ - kfree_skb(element); -} - static void mempool_zone_complete(struct mempool_zone *zone) { @@ -558,37 +622,6 @@ mempool_zone_complete(struct mempool_zone *zone) spin_unlock_irqrestore(&zone->freelock, flags); } -static struct mempool_zone * -mempool_zone_init(unsigned max, unsigned size, unsigned hiwat) -{ - struct mempool_zone *zp; - - zp = kzalloc(sizeof(*zp), GFP_KERNEL); - if (!zp) - return NULL; - - zp->size = size; - zp->hiwat = hiwat; - INIT_LIST_HEAD(&zp->freequeue); - spin_lock_init(&zp->freelock); - atomic_set(&zp->allocated, 0); - - zp->pool = mempool_create(max, mempool_zone_alloc_skb, - mempool_zone_free_skb, zp); - if (!zp->pool) { - kfree(zp); - return NULL; - } - - return zp; -} - -static void mempool_zone_destroy(struct mempool_zone *zp) -{ - mempool_destroy(zp->pool); - kfree(zp); -} - static struct sk_buff* mempool_zone_get_skb(struct mempool_zone *zone) { @@ -600,6 +633,27 @@ mempool_zone_get_skb(struct mempool_zone *zone) return skb; } +static int +iscsi_broadcast_skb(struct mempool_zone *zone, struct sk_buff *skb) +{ + unsigned long flags; + int rc; + + skb_get(skb); + rc = netlink_broadcast(nls, skb, 0, 1, GFP_KERNEL); + if (rc < 0) { + mempool_free(skb, zone->pool); + printk(KERN_ERR "iscsi: can not broadcast skb (%d)\n", rc); + return rc; + } + + spin_lock_irqsave(&zone->freelock, flags); + INIT_LIST_HEAD(skb_to_lh(skb)); + list_add(skb_to_lh(skb), &zone->freequeue); + spin_unlock_irqrestore(&zone->freelock, flags); + return 0; +} + static int iscsi_unicast_skb(struct mempool_zone *zone, struct sk_buff *skb, int pid) { @@ -695,7 +749,7 @@ void iscsi_conn_error(struct iscsi_cls_conn *conn, enum iscsi_err error) ev->r.connerror.cid = conn->cid; ev->r.connerror.sid = iscsi_conn_get_sid(conn); - iscsi_unicast_skb(conn->z_error, skb, priv->daemon_pid); + iscsi_broadcast_skb(conn->z_error, skb); dev_printk(KERN_INFO, &conn->dev, "iscsi: detected conn error (%d)\n", error); @@ -796,6 +850,131 @@ iscsi_if_get_stats(struct iscsi_transport *transport, struct nlmsghdr *nlh) return err; } +/** + * iscsi_if_destroy_session_done - send session destruction completion event + * @conn: last connection for session + * + * This is called by HW iscsi LLDs to notify userspace that its HW has + * removed a session.
+ **/ +int iscsi_if_destroy_session_done(struct iscsi_cls_conn *conn) +{ + struct iscsi_internal *priv; + struct iscsi_cls_session *session; + struct Scsi_Host *shost; + struct iscsi_uevent *ev; + struct sk_buff *skb; + struct nlmsghdr *nlh; + unsigned long flags; + int rc, len = NLMSG_SPACE(sizeof(*ev)); + + priv = iscsi_if_transport_lookup(conn->transport); + if (!priv) + return -EINVAL; + + session = iscsi_dev_to_session(conn->dev.parent); + shost = iscsi_session_to_shost(session); + + mempool_zone_complete(conn->z_pdu); + + skb = mempool_zone_get_skb(conn->z_pdu); + if (!skb) { + dev_printk(KERN_ERR, &conn->dev, "Cannot notify userspace of " + "session destruction event\n"); + return -ENOMEM; + } + + nlh = __nlmsg_put(skb, priv->daemon_pid, 0, 0, (len - sizeof(*nlh)), 0); + ev = NLMSG_DATA(nlh); + ev->transport_handle = iscsi_handle(conn->transport); + ev->type = ISCSI_KEVENT_DESTROY_SESSION; + ev->r.d_session.host_no = shost->host_no; + ev->r.d_session.sid = session->sid; + + /* + * this will occur if the daemon is not up, so we just warn + * the user and when the daemon is restarted it will handle it + */ + rc = iscsi_broadcast_skb(conn->z_pdu, skb); + if (rc < 0) + dev_printk(KERN_ERR, &conn->dev, "Cannot notify userspace of " + "session destruction event. Check iscsi daemon\n"); + + spin_lock_irqsave(&sesslock, flags); + list_del(&session->sess_list); + spin_unlock_irqrestore(&sesslock, flags); + + spin_lock_irqsave(&connlock, flags); + conn->active = 0; + list_del(&conn->conn_list); + spin_unlock_irqrestore(&connlock, flags); + + return rc; +} +EXPORT_SYMBOL_GPL(iscsi_if_destroy_session_done); + +/** + * iscsi_if_create_session_done - send session creation completion event + * @conn: leading connection for session + * + * This is called by HW iscsi LLDs to notify userspace that its HW has + * created a session or an existing session is back in the logged-in state. **/ +int iscsi_if_create_session_done(struct iscsi_cls_conn *conn) +{ + struct iscsi_internal *priv; + struct iscsi_cls_session *session; + struct Scsi_Host *shost; + struct iscsi_uevent *ev; + struct sk_buff *skb; + struct nlmsghdr *nlh; + unsigned long flags; + int rc, len = NLMSG_SPACE(sizeof(*ev)); + + priv = iscsi_if_transport_lookup(conn->transport); + if (!priv) + return -EINVAL; + + session = iscsi_dev_to_session(conn->dev.parent); + shost = iscsi_session_to_shost(session); + + mempool_zone_complete(conn->z_pdu); + + skb = mempool_zone_get_skb(conn->z_pdu); + if (!skb) { + dev_printk(KERN_ERR, &conn->dev, "Cannot notify userspace of " + "session creation event\n"); + return -ENOMEM; + } + + nlh = __nlmsg_put(skb, priv->daemon_pid, 0, 0, (len - sizeof(*nlh)), 0); + ev = NLMSG_DATA(nlh); + ev->transport_handle = iscsi_handle(conn->transport); + ev->type = ISCSI_UEVENT_CREATE_SESSION; + ev->r.c_session_ret.host_no = shost->host_no; + ev->r.c_session_ret.sid = session->sid; + + /* + * this will occur if the daemon is not up, so we just warn + * the user and when the daemon is restarted it will handle it + */ + rc = iscsi_broadcast_skb(conn->z_pdu, skb); + if (rc < 0) + dev_printk(KERN_ERR, &conn->dev, "Cannot notify userspace of " + "session creation event.
Check iscsi daemon\n"); + + spin_lock_irqsave(&sesslock, flags); + list_add(&session->sess_list, &sesslist); + spin_unlock_irqrestore(&sesslock, flags); + + spin_lock_irqsave(&connlock, flags); + list_add(&conn->conn_list, &connlist); + conn->active = 1; + spin_unlock_irqrestore(&connlock, flags); + return rc; +} +EXPORT_SYMBOL_GPL(iscsi_if_create_session_done); + static int iscsi_if_create_session(struct iscsi_internal *priv, struct iscsi_uevent *ev) { @@ -841,26 +1020,6 @@ iscsi_if_create_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev) return -ENOMEM; } - conn->z_pdu = mempool_zone_init(Z_MAX_PDU, - NLMSG_SPACE(sizeof(struct iscsi_uevent) + - sizeof(struct iscsi_hdr) + - DEFAULT_MAX_RECV_DATA_SEGMENT_LENGTH), - Z_HIWAT_PDU); - if (!conn->z_pdu) { - dev_printk(KERN_ERR, &conn->dev, "iscsi: can not allocate " - "pdu zone for new conn\n"); - goto destroy_conn; - } - - conn->z_error = mempool_zone_init(Z_MAX_ERROR, - NLMSG_SPACE(sizeof(struct iscsi_uevent)), - Z_HIWAT_ERROR); - if (!conn->z_error) { - dev_printk(KERN_ERR, &conn->dev, "iscsi: can not allocate " - "error zone for new conn\n"); - goto free_pdu_pool; - } - ev->r.c_conn_ret.sid = session->sid; ev->r.c_conn_ret.cid = conn->cid; @@ -870,13 +1029,6 @@ iscsi_if_create_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev) spin_unlock_irqrestore(&connlock, flags); return 0; - -free_pdu_pool: - mempool_zone_destroy(conn->z_pdu); -destroy_conn: - if (transport->destroy_conn) - transport->destroy_conn(conn->dd_data); - return -ENOMEM; } static int @@ -884,7 +1036,6 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev { unsigned long flags; struct iscsi_cls_conn *conn; - struct mempool_zone *z_error, *z_pdu; conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid); if (!conn) @@ -894,15 +1045,8 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev list_del(&conn->conn_list); spin_unlock_irqrestore(&connlock, flags); - z_pdu = conn->z_pdu; - z_error = conn->z_error; - if (transport->destroy_conn) transport->destroy_conn(conn); - - mempool_zone_destroy(z_pdu); - mempool_zone_destroy(z_error); - return 0; } @@ -1331,6 +1475,7 @@ iscsi_register_transport(struct iscsi_transport *tt) if (!priv) return NULL; INIT_LIST_HEAD(&priv->list); + priv->daemon_pid = -1; priv->iscsi_transport = tt; priv->t.user_scan = iscsi_user_scan; diff --git a/include/scsi/iscsi_if.h b/include/scsi/iscsi_if.h index 8813f0f4c624..55ebf035e620 100644 --- a/include/scsi/iscsi_if.h +++ b/include/scsi/iscsi_if.h @@ -53,6 +53,7 @@ enum iscsi_uevent_e { ISCSI_KEVENT_RECV_PDU = KEVENT_BASE + 1, ISCSI_KEVENT_CONN_ERROR = KEVENT_BASE + 2, ISCSI_KEVENT_IF_ERROR = KEVENT_BASE + 3, + ISCSI_KEVENT_DESTROY_SESSION = KEVENT_BASE + 4, }; enum iscsi_tgt_dscvr { @@ -157,27 +158,13 @@ struct iscsi_uevent { uint32_t cid; uint32_t error; /* enum iscsi_err */ } connerror; + struct msg_session_destroyed { + uint32_t host_no; + uint32_t sid; + } d_session; struct msg_transport_connect_ret { uint64_t handle; } ep_connect_ret; - struct msg_tgt_dscvr_ret { - /* - * session/connection pair used to reference - * the connection to server - */ - uint32_t sid; - uint32_t cid; - union { - struct isns { - /* port # for conn to iSNS server */ - uint16_t isns_port; - /* listening port to receive SCNs */ - uint16_t scn_port; - /* listening port to receive ESIs */ - uint16_t esi_port; - } isns_attrib; - } u; - } tgt_dscvr_ret; } r; } __attribute__ ((aligned (sizeof(uint64_t)))); diff --git 
a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index f7b0db5f2f5b..5a3df1d7085f 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -214,6 +214,8 @@ extern struct iscsi_cls_session *iscsi_alloc_session(struct Scsi_Host *shost, struct iscsi_transport *transport); extern int iscsi_add_session(struct iscsi_cls_session *session, unsigned int target_id); +extern int iscsi_if_create_session_done(struct iscsi_cls_conn *conn); +extern int iscsi_if_destroy_session_done(struct iscsi_cls_conn *conn); extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, struct iscsi_transport *t, unsigned int target_id); @@ -226,4 +228,5 @@ extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); extern void iscsi_unblock_session(struct iscsi_cls_session *session); extern void iscsi_block_session(struct iscsi_cls_session *session); + #endif -- cgit v1.2.3 From 257a5bdeb0441789d8e34e1b3e92b26d0f51bbf0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 29 Jun 2006 22:40:06 +0100 Subject: Remove export of include/linux/isdn/tpam.h Signed-off-by: David Woodhouse --- include/linux/isdn/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/isdn/Kbuild b/include/linux/isdn/Kbuild index c1727c893004..991cdb29ab2e 100644 --- a/include/linux/isdn/Kbuild +++ b/include/linux/isdn/Kbuild @@ -1 +1 @@ -header-y += capicmd.h tpam.h +header-y += capicmd.h -- cgit v1.2.3 From ccb2fe209dac9ff67f6351e783e610073afaaeaf Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 28 Jun 2006 13:49:52 -0700 Subject: [CPUFREQ] Remove slowdown from ondemand sampling path. Remove slowdown from ondemand sampling path. This reduces the code path length in dbs_check_cpu() by half. The slowdown tunable was not used by ondemand by default. If there are any user-level tools that were using this tunable, they may report an error now.
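The simplified path boils each sample down to a single load figure; a worked sketch with illustrative numbers (the code matches the hunks below):

	load = (100 * (total_ticks - idle_ticks)) / total_ticks;

	/* e.g. up_threshold = 80, cur = 2000 MHz, load = 35:
	 * 35 < 80 - 10, so pick the lowest frequency that keeps the
	 * load just under the threshold:
	 * freq_next = 2000 * 35 / (80 - 10) = 1000 MHz
	 */
	if (load < dbs_tuners_ins.up_threshold - 10)
		freq_next = (policy->cur * load) /
				(dbs_tuners_ins.up_threshold - 10);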
Signed-off-by: Alexey Starikovskiy Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_ondemand.c | 138 +++++++++++-------------------------- include/asm-generic/cputime.h | 2 + 2 files changed, 42 insertions(+), 98 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 693e540481b4..a2add11e56f1 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -56,16 +56,14 @@ static unsigned int def_sampling_rate; #define MIN_SAMPLING_RATE (def_sampling_rate / MIN_SAMPLING_RATE_RATIO) #define MAX_SAMPLING_RATE (500 * def_sampling_rate) #define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER (1000) -#define DEF_SAMPLING_DOWN_FACTOR (1) -#define MAX_SAMPLING_DOWN_FACTOR (10) #define TRANSITION_LATENCY_LIMIT (10 * 1000) static void do_dbs_timer(void *data); struct cpu_dbs_info_s { + cputime64_t prev_cpu_idle; + cputime64_t prev_cpu_wall; struct cpufreq_policy *cur_policy; - unsigned int prev_cpu_idle_up; - unsigned int prev_cpu_idle_down; unsigned int enable; }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); @@ -87,24 +85,26 @@ static struct workqueue_struct *dbs_workq; struct dbs_tuners { unsigned int sampling_rate; - unsigned int sampling_down_factor; unsigned int up_threshold; unsigned int ignore_nice; }; static struct dbs_tuners dbs_tuners_ins = { .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, - .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, .ignore_nice = 0, }; -static inline unsigned int get_cpu_idle_time(unsigned int cpu) +static inline cputime64_t get_cpu_idle_time(unsigned int cpu) { - return kstat_cpu(cpu).cpustat.idle + - kstat_cpu(cpu).cpustat.iowait + - ( dbs_tuners_ins.ignore_nice ? - kstat_cpu(cpu).cpustat.nice : - 0); + cputime64_t retval; + + retval = cputime64_add(kstat_cpu(cpu).cpustat.idle, + kstat_cpu(cpu).cpustat.iowait); + + if (dbs_tuners_ins.ignore_nice) + retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice); + + return retval; } /************************** sysfs interface ************************/ @@ -133,29 +133,9 @@ static ssize_t show_##file_name \ return sprintf(buf, "%u\n", dbs_tuners_ins.object); \ } show_one(sampling_rate, sampling_rate); -show_one(sampling_down_factor, sampling_down_factor); show_one(up_threshold, up_threshold); show_one(ignore_nice_load, ignore_nice); -static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused, - const char *buf, size_t count) -{ - unsigned int input; - int ret; - ret = sscanf (buf, "%u", &input); - if (ret != 1 ) - return -EINVAL; - - if (input > MAX_SAMPLING_DOWN_FACTOR || input < 1) - return -EINVAL; - - mutex_lock(&dbs_mutex); - dbs_tuners_ins.sampling_down_factor = input; - mutex_unlock(&dbs_mutex); - - return count; -} - static ssize_t store_sampling_rate(struct cpufreq_policy *unused, const char *buf, size_t count) { @@ -217,12 +197,12 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, } dbs_tuners_ins.ignore_nice = input; - /* we need to re-evaluate prev_cpu_idle_up and prev_cpu_idle_down */ + /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { - struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); - j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j); - j_dbs_info->prev_cpu_idle_down = j_dbs_info->prev_cpu_idle_up; + struct cpu_dbs_info_s *dbs_info; + dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info->prev_cpu_idle = get_cpu_idle_time(j); + dbs_info->prev_cpu_wall = get_jiffies_64(); } 
mutex_unlock(&dbs_mutex); @@ -234,7 +214,6 @@ static struct freq_attr _name = \ __ATTR(_name, 0644, show_##_name, store_##_name) define_one_rw(sampling_rate); -define_one_rw(sampling_down_factor); define_one_rw(up_threshold); define_one_rw(ignore_nice_load); @@ -242,7 +221,6 @@ static struct attribute * dbs_attributes[] = { &sampling_rate_max.attr, &sampling_rate_min.attr, &sampling_rate.attr, - &sampling_down_factor.attr, &up_threshold.attr, &ignore_nice_load.attr, NULL @@ -257,11 +235,10 @@ static struct attribute_group dbs_attr_group = { static void dbs_check_cpu(int cpu) { - unsigned int idle_ticks, up_idle_ticks, total_ticks; - unsigned int freq_next; - unsigned int freq_down_sampling_rate; - static int down_skip[NR_CPUS]; + unsigned int idle_ticks, total_ticks; + unsigned int load; struct cpu_dbs_info_s *this_dbs_info; + cputime64_t cur_jiffies; struct cpufreq_policy *policy; unsigned int j; @@ -271,10 +248,14 @@ static void dbs_check_cpu(int cpu) return; policy = this_dbs_info->cur_policy; + cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); + total_ticks = (unsigned int) cputime64_sub(cur_jiffies, + this_dbs_info->prev_cpu_wall); + this_dbs_info->prev_cpu_wall = cur_jiffies; /* * Every sampling_rate, we check, if current idle time is less * than 20% (default), then we try to increase frequency - * Every sampling_rate*sampling_down_factor, we look for a the lowest + * Every sampling_rate, we look for a the lowest * frequency which can sustain the load while keeping idle time over * 30%. If such a frequency exist, we try to decrease to this frequency. * @@ -283,36 +264,26 @@ static void dbs_check_cpu(int cpu) * 5% (default) of current frequency */ - /* Check for frequency increase */ + /* Get Idle Time */ idle_ticks = UINT_MAX; for_each_cpu_mask(j, policy->cpus) { - unsigned int tmp_idle_ticks, total_idle_ticks; + cputime64_t total_idle_ticks; + unsigned int tmp_idle_ticks; struct cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); total_idle_ticks = get_cpu_idle_time(j); - tmp_idle_ticks = total_idle_ticks - - j_dbs_info->prev_cpu_idle_up; - j_dbs_info->prev_cpu_idle_up = total_idle_ticks; + tmp_idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks, + j_dbs_info->prev_cpu_idle); + j_dbs_info->prev_cpu_idle = total_idle_ticks; if (tmp_idle_ticks < idle_ticks) idle_ticks = tmp_idle_ticks; } + load = (100 * (total_ticks - idle_ticks)) / total_ticks; - /* Scale idle ticks by 100 and compare with up and down ticks */ - idle_ticks *= 100; - up_idle_ticks = (100 - dbs_tuners_ins.up_threshold) * - usecs_to_jiffies(dbs_tuners_ins.sampling_rate); - - if (idle_ticks < up_idle_ticks) { - down_skip[cpu] = 0; - for_each_cpu_mask(j, policy->cpus) { - struct cpu_dbs_info_s *j_dbs_info; - - j_dbs_info = &per_cpu(cpu_dbs_info, j); - j_dbs_info->prev_cpu_idle_down = - j_dbs_info->prev_cpu_idle_up; - } + /* Check for frequency increase */ + if (load > dbs_tuners_ins.up_threshold) { /* if we are already at full speed then break out early */ if (policy->cur == policy->max) return; @@ -323,50 +294,22 @@ static void dbs_check_cpu(int cpu) } /* Check for frequency decrease */ - down_skip[cpu]++; - if (down_skip[cpu] < dbs_tuners_ins.sampling_down_factor) - return; - - idle_ticks = UINT_MAX; - for_each_cpu_mask(j, policy->cpus) { - unsigned int tmp_idle_ticks, total_idle_ticks; - struct cpu_dbs_info_s *j_dbs_info; - - j_dbs_info = &per_cpu(cpu_dbs_info, j); - /* Check for frequency decrease */ - total_idle_ticks = j_dbs_info->prev_cpu_idle_up; - tmp_idle_ticks = total_idle_ticks - - 
j_dbs_info->prev_cpu_idle_down; - j_dbs_info->prev_cpu_idle_down = total_idle_ticks; - - if (tmp_idle_ticks < idle_ticks) - idle_ticks = tmp_idle_ticks; - } - - down_skip[cpu] = 0; /* if we cannot reduce the frequency anymore, break out early */ if (policy->cur == policy->min) return; - /* Compute how many ticks there are between two measurements */ - freq_down_sampling_rate = dbs_tuners_ins.sampling_rate * - dbs_tuners_ins.sampling_down_factor; - total_ticks = usecs_to_jiffies(freq_down_sampling_rate); - /* * The optimal frequency is the frequency that is the lowest that * can support the current CPU usage without triggering the up * policy. To be safe, we focus 10 points under the threshold. */ - freq_next = ((total_ticks - idle_ticks) * 100) / total_ticks; - freq_next = (freq_next * policy->cur) / + if (load < (dbs_tuners_ins.up_threshold - 10)) { + unsigned int freq_next; + freq_next = (policy->cur * load) / (dbs_tuners_ins.up_threshold - 10); - if (freq_next < policy->min) - freq_next = policy->min; - - if (freq_next <= ((policy->cur * 95) / 100)) __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L); + } } static void do_dbs_timer(void *data) @@ -432,9 +375,8 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; - j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j); - j_dbs_info->prev_cpu_idle_down - = j_dbs_info->prev_cpu_idle_up; + j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j); + j_dbs_info->prev_cpu_wall = get_jiffies_64(); } this_dbs_info->enable = 1; sysfs_create_group(&policy->kobj, &dbs_attr_group); diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 6f178563e336..09204e40d663 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -24,7 +24,9 @@ typedef u64 cputime64_t; #define cputime64_zero (0ULL) #define cputime64_add(__a, __b) ((__a) + (__b)) +#define cputime64_sub(__a, __b) ((__a) - (__b)) #define cputime64_to_jiffies64(__ct) (__ct) +#define jiffies64_to_cputime64(__jif) (__jif) #define cputime_to_cputime64(__ct) ((u64) __ct) -- cgit v1.2.3 From 7a6bc1cdd506cf81f856f0fef4e56a2ba0c5a26d Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 28 Jun 2006 13:50:33 -0700 Subject: [CPUFREQ] Add queue_delayed_work_on() interface for workqueues. Add queue_delayed_work_on() interface for workqueues. 
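A minimal usage sketch (the work item and queue names are hypothetical, and this assumes the three-argument DECLARE_WORK of this era):

	static void my_poll_fn(void *data);
	static DECLARE_WORK(my_poll_work, my_poll_fn, NULL);
	static struct workqueue_struct *my_wq;

	/* run my_poll_fn on CPU 1's worker thread after about a second */
	my_wq = create_workqueue("mypoll");
	queue_delayed_work_on(1, my_wq, &my_poll_work, HZ);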
Signed-off-by: Alexey Starikovskiy Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 957c21c16d62..9bca3539a1e5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -63,6 +63,8 @@ extern void destroy_workqueue(struct workqueue_struct *wq); extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay)); +extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work, unsigned long delay); extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq)); extern int FASTCALL(schedule_work(struct work_struct *work)); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 59f0b42bd89e..8fbef7008a7e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,27 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, return ret; } +int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &work->timer; + + if (!test_and_set_bit(0, &work->pending)) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + /* This stores wq for the moment, for the timer_fn */ + work->wq_data = wq; + timer->expires = jiffies + delay; + timer->data = (unsigned long)work; + timer->function = delayed_work_timer_fn; + add_timer_on(timer, cpu); + ret = 1; + } + return ret; +} + static void run_workqueue(struct cpu_workqueue_struct *cwq) { unsigned long flags; @@ -411,21 +432,7 @@ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &work->timer; - - if (!test_and_set_bit(0, &work->pending)) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - /* This stores keventd_wq for the moment, for the timer_fn */ - work->wq_data = keventd_wq; - timer->expires = jiffies + delay; - timer->data = (unsigned long)work; - timer->function = delayed_work_timer_fn; - add_timer_on(timer, cpu); - ret = 1; - } - return ret; + return queue_delayed_work_on(cpu, keventd_wq, work, delay); } /** @@ -622,6 +629,7 @@ void init_workqueues(void) EXPORT_SYMBOL_GPL(__create_workqueue); EXPORT_SYMBOL_GPL(queue_work); EXPORT_SYMBOL_GPL(queue_delayed_work); +EXPORT_SYMBOL_GPL(queue_delayed_work_on); EXPORT_SYMBOL_GPL(flush_workqueue); EXPORT_SYMBOL_GPL(destroy_workqueue); -- cgit v1.2.3 From 02438d8771ae6a4b215938959827692026380bf9 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 30 Jun 2006 03:19:10 -0400 Subject: ACPI: delete acpi_os_free(), use kfree() directly Signed-off-by: Len Brown --- arch/ia64/kernel/acpi-ext.c | 2 +- arch/ia64/kernel/acpi.c | 12 ++++++------ drivers/acpi/acpi_memhotplug.c | 4 ++-- drivers/acpi/asus_acpi.c | 6 +++--- drivers/acpi/battery.c | 4 ++-- drivers/acpi/container.c | 2 +- drivers/acpi/glue.c | 8 ++++---- drivers/acpi/namespace/nsxfeval.c | 2 +- drivers/acpi/osl.c | 9 +-------- drivers/acpi/processor_idle.c | 2 +- drivers/acpi/processor_perflib.c | 6 +++--- drivers/acpi/scan.c | 4 ++-- drivers/acpi/system.c | 4 ++-- drivers/acpi/utilities/utalloc.c | 4 ++-- 
drivers/acpi/utilities/utcache.c | 2 +- drivers/acpi/utils.c | 4 ++-- drivers/acpi/video.c | 2 +- include/acpi/acmacros.h | 2 +- include/acpi/acpiosxf.h | 2 -- 19 files changed, 36 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c index fff82929d225..ccd016537fe9 100644 --- a/arch/ia64/kernel/acpi-ext.c +++ b/arch/ia64/kernel/acpi-ext.c @@ -51,7 +51,7 @@ static acpi_status hp_ccsr_locate(acpi_handle obj, u64 *base, u64 *length) memcpy(length, vendor->byte_data + 8, sizeof(*length)); exit: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return status; } diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index ca16d9556bde..c92c0aaaf2e6 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -857,7 +857,7 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu) obj = buffer.pointer; if (obj->type != ACPI_TYPE_BUFFER || obj->buffer.length < sizeof(*lsapic)) { - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return -EINVAL; } @@ -865,13 +865,13 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu) if ((lsapic->header.type != ACPI_MADT_LSAPIC) || (!lsapic->flags.enabled)) { - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return -EINVAL; } physid = ((lsapic->id << 8) | (lsapic->eid)); - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; @@ -935,20 +935,20 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) obj = buffer.pointer; if (obj->type != ACPI_TYPE_BUFFER || obj->buffer.length < sizeof(*iosapic)) { - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return AE_OK; } iosapic = (struct acpi_table_iosapic *)obj->buffer.pointer; if (iosapic->header.type != ACPI_MADT_IOSAPIC) { - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return AE_OK; } gsi_base = iosapic->global_irq_base; - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); /* * OK, it's an IOSAPIC MADT entry, look for a _PXM value to tell diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 6f5e395c78af..2c626e81a289 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -466,7 +466,7 @@ static acpi_status is_memory_device(acpi_handle handle) info = buffer.pointer; if (!(info->valid & ACPI_VALID_HID)) { - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return AE_ERROR; } @@ -475,7 +475,7 @@ static acpi_status is_memory_device(acpi_handle handle) (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) status = AE_ERROR; - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return status; } diff --git a/drivers/acpi/asus_acpi.c b/drivers/acpi/asus_acpi.c index 055cfd5c8766..eb0b8fb837c6 100644 --- a/drivers/acpi/asus_acpi.c +++ b/drivers/acpi/asus_acpi.c @@ -1017,7 +1017,7 @@ static int asus_hotk_get_info(void) } hotk->methods = &model_conf[hotk->model]; - acpi_os_free(model); + kfree(model); return AE_OK; } @@ -1096,7 +1096,7 @@ static int asus_hotk_get_info(void) /* S1300A reports L84F, but L1400B too, account for that */ } - acpi_os_free(model); + kfree(model); return AE_OK; } @@ -1256,7 +1256,7 @@ static void __exit asus_acpi_exit(void) acpi_bus_unregister_driver(&asus_hotk_driver); remove_proc_entry(PROC_ASUS, acpi_root_dir); - acpi_os_free(asus_info); + kfree(asus_info); return; } diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 00b0728efe82..7d92f73b265f 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -171,7 
+171,7 @@ acpi_battery_get_info(struct acpi_battery *battery, } end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); if (!result) (*bif) = (struct acpi_battery_info *)data.pointer; @@ -231,7 +231,7 @@ acpi_battery_get_status(struct acpi_battery *battery, } end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); if (!result) (*bst) = (struct acpi_battery_status *)data.pointer; diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c index 7f7e41d40a3b..871aa520ece7 100644 --- a/drivers/acpi/container.c +++ b/drivers/acpi/container.c @@ -236,7 +236,7 @@ container_walk_namespace_cb(acpi_handle handle, } end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return AE_OK; } diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 8daef57b994c..10f160dc75b1 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -152,7 +152,7 @@ static int get_root_bridge_busnr(acpi_handle handle) bbn = bus; } exit: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return (int)bbn; } @@ -192,7 +192,7 @@ find_pci_rootbridge(acpi_handle handle, u32 lvl, void *context, void **rv) find->handle = handle; status = AE_OK; exit: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return status; } @@ -224,7 +224,7 @@ do_acpi_find_child(acpi_handle handle, u32 lvl, void *context, void **rv) info = buffer.pointer; if (info->address == find->address) find->handle = handle; - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); } return AE_OK; } @@ -330,7 +330,7 @@ static int acpi_platform_notify(struct device *dev) acpi_get_name(dev->firmware_data, ACPI_FULL_PATHNAME, &buffer); DBG("Device %s -> %s\n", dev->bus_id, (char *)buffer.pointer); - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); } else DBG("Device %s -> No ACPI support\n", dev->bus_id); #endif diff --git a/drivers/acpi/namespace/nsxfeval.c b/drivers/acpi/namespace/nsxfeval.c index 6d9bd45af30a..dca6799ac678 100644 --- a/drivers/acpi/namespace/nsxfeval.c +++ b/drivers/acpi/namespace/nsxfeval.c @@ -133,7 +133,7 @@ acpi_evaluate_object_typed(acpi_handle handle, /* Caller used ACPI_ALLOCATE_BUFFER, free the return buffer */ - acpi_os_free(return_buffer->pointer); + ACPI_FREE(return_buffer->pointer); return_buffer->pointer = NULL; } diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index afd937b158b3..c68b1bb138c3 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -147,13 +147,6 @@ void *acpi_os_allocate(acpi_size size) return kmalloc(size, GFP_KERNEL); } -void acpi_os_free(void *ptr) -{ - kfree(ptr); -} - -EXPORT_SYMBOL(acpi_os_free); - acpi_status acpi_os_get_root_pointer(u32 flags, struct acpi_pointer *addr) { if (efi_enabled) { @@ -743,7 +736,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Deleting semaphore[%p].\n", handle)); - acpi_os_free(sem); + kfree(sem); sem = NULL; return AE_OK; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index e439eb77d283..8e9c26aae8fe 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -768,7 +768,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) status = -EFAULT; end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return status; } diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 14a00e5a8f6a..7ba5e49ab302 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -216,7 +216,7 @@ static int acpi_processor_get_performance_control(struct acpi_processor *pr) 
sizeof(struct acpi_pct_register)); end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return result; } @@ -294,7 +294,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) } end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return result; } @@ -592,7 +592,7 @@ static int acpi_processor_get_psd(struct acpi_processor *pr) } end: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return result; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 401e723e6c66..a05b3dfb5cc5 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -319,7 +319,7 @@ static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device) goto end; } - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); device->wakeup.flags.valid = 1; /* Power button, Lid switch always enable wakeup */ @@ -854,7 +854,7 @@ static void acpi_device_set_id(struct acpi_device *device, printk(KERN_ERR "Memory allocation error\n"); } - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); } static int acpi_device_set_context(struct acpi_device *device, int type) diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index c90bd2f70b3f..c3bb7faad75e 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -86,7 +86,7 @@ acpi_system_read_dsdt(struct file *file, res = simple_read_from_buffer(buffer, count, ppos, dsdt.pointer, dsdt.length); - acpi_os_free(dsdt.pointer); + kfree(dsdt.pointer); return res; } @@ -113,7 +113,7 @@ acpi_system_read_fadt(struct file *file, res = simple_read_from_buffer(buffer, count, ppos, fadt.pointer, fadt.length); - acpi_os_free(fadt.pointer); + kfree(fadt.pointer); return res; } diff --git a/drivers/acpi/utilities/utalloc.c b/drivers/acpi/utilities/utalloc.c index 7940fc1bd69e..5cff17dc78b3 100644 --- a/drivers/acpi/utilities/utalloc.c +++ b/drivers/acpi/utilities/utalloc.c @@ -166,10 +166,10 @@ acpi_status acpi_ut_delete_caches(void) /* Free memory lists */ - acpi_os_free(acpi_gbl_global_list); + ACPI_FREE(acpi_gbl_global_list); acpi_gbl_global_list = NULL; - acpi_os_free(acpi_gbl_ns_node_list); + ACPI_FREE(acpi_gbl_ns_node_list); acpi_gbl_ns_node_list = NULL; #endif diff --git a/drivers/acpi/utilities/utcache.c b/drivers/acpi/utilities/utcache.c index 56270a30718a..1a1f8109159c 100644 --- a/drivers/acpi/utilities/utcache.c +++ b/drivers/acpi/utilities/utcache.c @@ -162,7 +162,7 @@ acpi_status acpi_os_delete_cache(struct acpi_memory_list * cache) /* Now we can delete the cache object */ - acpi_os_free(cache); + ACPI_FREE(cache); return (AE_OK); } diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 1930e1a75b22..f48227f4c8c9 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -332,7 +332,7 @@ acpi_evaluate_string(acpi_handle handle, ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Return value [%s]\n", *data)); - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return AE_OK; } @@ -418,7 +418,7 @@ acpi_evaluate_reference(acpi_handle handle, //kfree(list->handles); } - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return status; } diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 9feb633087a9..1f3ffb353299 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1450,7 +1450,7 @@ static int acpi_video_device_enumerate(struct acpi_video_bus *video) video->attached_array = active_device_list; video->attached_count = count; out: - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return status; } diff --git a/include/acpi/acmacros.h b/include/acpi/acmacros.h index 
4bb38068f40d..f1ac6109556e 100644 --- a/include/acpi/acmacros.h +++ b/include/acpi/acmacros.h @@ -726,7 +726,7 @@ #define ACPI_ALLOCATE(a) acpi_ut_allocate((acpi_size)(a),_COMPONENT,_acpi_module_name,__LINE__) #define ACPI_ALLOCATE_ZEROED(a) acpi_ut_allocate_zeroed((acpi_size)(a), _COMPONENT,_acpi_module_name,__LINE__) -#define ACPI_FREE(a) acpi_os_free(a) +#define ACPI_FREE(a) kfree(a) #define ACPI_MEM_TRACKING(a) #else diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index 89bc4a16c2e8..0cd63bce0ae4 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -143,8 +143,6 @@ void acpi_os_release_mutex(acpi_mutex handle); */ void *acpi_os_allocate(acpi_size size); -void acpi_os_free(void *memory); - acpi_status acpi_os_map_memory(acpi_physical_address physical_address, acpi_size size, void __iomem ** logical_address); -- cgit v1.2.3 From 947deee8904b3c2edc7f59ab6e6242499e4dc434 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 2 Jul 2006 20:45:51 +0100 Subject: [SERIAL] Convert fifosize to an unsigned int Some UARTs have more than 255 bytes of FIFO, which can't be represented by an unsigned char. Change the kernel's internal structure to be an unsigned int, but still export an unsigned char via the TIOCGSERIAL ioctl. If the TIOCSSERIAL ioctl provides a fifo size of 0, assume this means "don't change" otherwise we'll corrupt the larger fifo sizes. Signed-off-by: Russell King --- drivers/serial/mpc52xx_uart.c | 3 +-- drivers/serial/serial_core.c | 6 ++++-- include/linux/serial_core.h | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c index 6459edc7f5c5..1b2cdc91c228 100644 --- a/drivers/serial/mpc52xx_uart.c +++ b/drivers/serial/mpc52xx_uart.c @@ -728,8 +728,7 @@ mpc52xx_uart_probe(struct platform_device *dev) spin_lock_init(&port->lock); port->uartclk = __res.bi_ipbfreq / 2; /* Look at CTLR doc */ - port->fifosize = 255; /* Should be 512 ! But it can't be */ - /* stored in a unsigned char */ + port->fifosize = 512; port->iotype = UPIO_MEM; port->flags = UPF_BOOT_AUTOCONF | ( uart_console(port) ? 0 : UPF_IOREMAP ); diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 17839e753e4c..3805f467b2f5 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -691,7 +691,8 @@ static int uart_set_info(struct uart_state *state, (new_serial.baud_base != port->uartclk / 16) || (close_delay != state->close_delay) || (closing_wait != state->closing_wait) || - (new_serial.xmit_fifo_size != port->fifosize) || + (new_serial.xmit_fifo_size && + new_serial.xmit_fifo_size != port->fifosize) || (((new_flags ^ old_flags) & ~UPF_USR_MASK) != 0)) goto exit; port->flags = ((port->flags & ~UPF_USR_MASK) | @@ -796,7 +797,8 @@ static int uart_set_info(struct uart_state *state, port->custom_divisor = new_serial.custom_divisor; state->close_delay = close_delay; state->closing_wait = closing_wait; - port->fifosize = new_serial.xmit_fifo_size; + if (new_serial.xmit_fifo_size) + port->fifosize = new_serial.xmit_fifo_size; if (state->info->tty) state->info->tty->low_latency = (port->flags & UPF_LOW_LATENCY) ? 
1 : 0; diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index bd14858121ea..966e2e8a1740 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -214,10 +214,11 @@ struct uart_port { unsigned char __iomem *membase; /* read/write[bwl] */ unsigned int irq; /* irq number */ unsigned int uartclk; /* base uart clock */ - unsigned char fifosize; /* tx fifo size */ + unsigned int fifosize; /* tx fifo size */ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ + unsigned char unused1; #define UPIO_PORT (0) #define UPIO_HUB6 (1) -- cgit v1.2.3 From a1af5b2fd49eb24ab8c024da5d853b09841d1f8f Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jun 2006 20:28:18 +1000 Subject: [POWERPC] change get_property to return void * Change the get_property() function to return a void *. This allows us to later remove the cast done in the majority of callers. Built for pseries, iseries, pmac32, cell, cbesim, g5, systemsim, maple, and mpc* defconfigs Signed-off-by: Jeremy Kerr Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/prom.c | 3 +-- include/asm-powerpc/prom.h | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4c524cb52184..ef3619c28702 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -1975,8 +1975,7 @@ struct property *of_find_property(struct device_node *np, const char *name, * Find a property with a given name for a given node * and return the value. */ -unsigned char *get_property(struct device_node *np, const char *name, - int *lenp) +void *get_property(struct device_node *np, const char *name, int *lenp) { struct property *pp = of_find_property(np,name,lenp); return pp ? pp->value : NULL; diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h index 010d186d095b..b0768f474809 100644 --- a/include/asm-powerpc/prom.h +++ b/include/asm-powerpc/prom.h @@ -167,8 +167,8 @@ extern void unflatten_device_tree(void); extern void early_init_devtree(void *); extern int device_is_compatible(struct device_node *device, const char *); extern int machine_is_compatible(const char *compat); -extern unsigned char *get_property(struct device_node *node, const char *name, - int *lenp); +extern void *get_property(struct device_node *node, const char *name, + int *lenp); extern void print_properties(struct device_node *node); extern int prom_n_addr_cells(struct device_node* np); extern int prom_n_size_cells(struct device_node* np); -- cgit v1.2.3 From b9e5b4e6a991a5a6d521f2e20a65835404b4169f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 3 Jul 2006 19:32:51 +1000 Subject: [POWERPC] Use the genirq framework This adapts the generic powerpc interrupt handling code, and all of the platforms except for the embedded 6xx machines, to use the new genirq framework. 
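To give a flavor of the conversion, a hedged sketch of what a PIC driver now does (iic_chip is taken from the cell diff below; the helper and flow handler names assume the genirq API of this era):

	/* instead of filling in a hw_interrupt_type with startup/enable/
	 * disable/end hooks, attach an irq_chip and pick a flow handler */
	set_irq_chip_and_handler(irq, &iic_chip, handle_fasteoi_irq);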
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/irq.c | 30 +-- arch/powerpc/kernel/misc_64.S | 10 +- arch/powerpc/platforms/cell/interrupt.c | 111 ++++---- arch/powerpc/platforms/cell/interrupt.h | 4 +- arch/powerpc/platforms/cell/setup.c | 12 +- arch/powerpc/platforms/cell/spider-pic.c | 173 ++++++------ arch/powerpc/platforms/chrp/setup.c | 17 +- arch/powerpc/platforms/iseries/irq.c | 17 +- arch/powerpc/platforms/powermac/pic.c | 170 +++++++----- arch/powerpc/platforms/pseries/setup.c | 26 +- arch/powerpc/platforms/pseries/xics.c | 443 +++++++++++++++++-------------- arch/powerpc/platforms/pseries/xics.h | 17 +- arch/powerpc/sysdev/i8259.c | 30 +-- arch/powerpc/sysdev/mpic.c | 193 +++++++------- include/asm-powerpc/i8259.h | 3 - include/asm-powerpc/irq.h | 8 +- include/asm-powerpc/mpic.h | 25 +- 17 files changed, 673 insertions(+), 616 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 525baab45d2d..91248559099a 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -62,28 +62,27 @@ #endif int __irq_offset_value; -#ifdef CONFIG_PPC32 -EXPORT_SYMBOL(__irq_offset_value); -#endif - static int ppc_spurious_interrupts; #ifdef CONFIG_PPC32 -#define NR_MASK_WORDS ((NR_IRQS + 31) / 32) +EXPORT_SYMBOL(__irq_offset_value); +atomic_t ppc_n_lost_interrupts; +#ifndef CONFIG_PPC_MERGE +#define NR_MASK_WORDS ((NR_IRQS + 31) / 32) unsigned long ppc_cached_irq_mask[NR_MASK_WORDS]; -atomic_t ppc_n_lost_interrupts; +#endif #ifdef CONFIG_TAU_INT extern int tau_initialized; extern int tau_interrupts(int); #endif +#endif /* CONFIG_PPC32 */ #if defined(CONFIG_SMP) && !defined(CONFIG_PPC_MERGE) extern atomic_t ipi_recv; extern atomic_t ipi_sent; #endif -#endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC64 EXPORT_SYMBOL(irq_desc); @@ -219,15 +218,19 @@ void do_IRQ(struct pt_regs *regs) curtp = current_thread_info(); irqtp = hardirq_ctx[smp_processor_id()]; if (curtp != irqtp) { + struct irq_desc *desc = irq_desc + irq; + void *handler = desc->handle_irq; + if (handler == NULL) + handler = &__do_IRQ; irqtp->task = curtp->task; irqtp->flags = 0; - call___do_IRQ(irq, regs, irqtp); + call_handle_irq(irq, desc, regs, irqtp, handler); irqtp->task = NULL; if (irqtp->flags) set_bits(irqtp->flags, &curtp->flags); } else #endif - __do_IRQ(irq, regs); + generic_handle_irq(irq, regs); } else if (irq != -2) /* That's not SMP safe ... but who cares ? 
*/ ppc_spurious_interrupts++; @@ -245,15 +248,6 @@ void do_IRQ(struct pt_regs *regs) void __init init_IRQ(void) { -#ifdef CONFIG_PPC64 - static int once = 0; - - if (once) - return; - - once++; - -#endif ppc_md.init_IRQ(); #ifdef CONFIG_PPC64 irq_ctx_init(); diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 0c3c70d115c6..bfb407fc1aa1 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -51,12 +51,14 @@ _GLOBAL(call_do_softirq) mtlr r0 blr -_GLOBAL(call___do_IRQ) +_GLOBAL(call_handle_irq) + ld r8,0(r7) mflr r0 std r0,16(r1) - stdu r1,THREAD_SIZE-112(r5) - mr r1,r5 - bl .__do_IRQ + mtctr r8 + stdu r1,THREAD_SIZE-112(r6) + mr r1,r6 + bctrl ld r1,0(r1) ld r0,16(r1) mtlr r0 diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 22da1335445a..97936f547f19 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -37,64 +37,51 @@ struct iic { struct cbe_iic_thread_regs __iomem *regs; u8 target_id; + u8 eoi_stack[16]; + int eoi_ptr; }; static DEFINE_PER_CPU(struct iic, iic); -void iic_local_enable(void) +static void iic_mask(unsigned int irq) { - struct iic *iic = &__get_cpu_var(iic); - u64 tmp; - - /* - * There seems to be a bug that is present in DD2.x CPUs - * and still only partially fixed in DD3.1. - * This bug causes a value written to the priority register - * not to make it there, resulting in a system hang unless we - * write it again. - * Masking with 0xf0 is done because the Cell BE does not - * implement the lower four bits of the interrupt priority, - * they always read back as zeroes, although future CPUs - * might implement different bits. - */ - do { - out_be64(&iic->regs->prio, 0xff); - tmp = in_be64(&iic->regs->prio); - } while ((tmp & 0xf0) != 0xf0); -} - -void iic_local_disable(void) -{ - out_be64(&__get_cpu_var(iic).regs->prio, 0x0); -} - -static unsigned int iic_startup(unsigned int irq) -{ - return 0; -} - -static void iic_enable(unsigned int irq) -{ - iic_local_enable(); } -static void iic_disable(unsigned int irq) +static void iic_unmask(unsigned int irq) { } -static void iic_end(unsigned int irq) +static void iic_eoi(unsigned int irq) { - iic_local_enable(); + struct iic *iic = &__get_cpu_var(iic); + out_be64(&iic->regs->prio, iic->eoi_stack[--iic->eoi_ptr]); + BUG_ON(iic->eoi_ptr < 0); } -static struct hw_interrupt_type iic_pic = { +static struct irq_chip iic_chip = { .typename = " CELL-IIC ", - .startup = iic_startup, - .enable = iic_enable, - .disable = iic_disable, - .end = iic_end, + .mask = iic_mask, + .unmask = iic_unmask, + .eoi = iic_eoi, }; +/* XXX All of this has to be reworked completely. We need to assign real + * interrupt numbers to the external interrupts and remove all the hard coded + * interrupt maps (rely on the device-tree whenever possible). + * + * Basically, my scheme is to define the "pendings" bits to be the HW interrupt + * number (ignoring the data and flags here). That means we can sort-of split + * external sources based on priority, and we can use request_irq() on pretty + * much anything. + * + * For spider or axon, they have their own interrupt space. spider will just have + * local "hardware" interrupts 0...xx * node stride. The node stride is not + * necessary (separate interrupt chips will have separate HW number space), but + * will allow us to stay compatible with existing device-trees.
+ * + * All of this little world will get a standard remapping scheme to map those HW + * numbers into the linux flat irq number space. +*/ static int iic_external_get_irq(struct cbe_iic_pending_bits pending) { int irq; @@ -118,9 +105,10 @@ static int iic_external_get_irq(struct cbe_iic_pending_bits pending) */ if (pending.class != 2) break; - irq = IIC_EXT_OFFSET - + spider_get_irq(node) - + node * IIC_NODE_STRIDE; + /* TODO: We might want to silently ignore cascade interrupts + * when no cascade handler exists yet + */ + irq = IIC_EXT_CASCADE + node * IIC_NODE_STRIDE; break; case 0x01 ... 0x04: case 0x07 ... 0x0a: @@ -152,6 +140,8 @@ int iic_get_irq(struct pt_regs *regs) iic = &__get_cpu_var(iic); *(unsigned long *) &pending = in_be64((unsigned long __iomem *) &iic->regs->pending_destr); + iic->eoi_stack[++iic->eoi_ptr] = pending.prio; + BUG_ON(iic->eoi_ptr > 15); irq = -1; if (pending.flags & CBE_IIC_IRQ_VALID) { @@ -172,7 +162,7 @@ int iic_get_irq(struct pt_regs *regs) /* hardcoded part to be compatible with older firmware */ -static int setup_iic_hardcoded(void) +static int __init setup_iic_hardcoded(void) { struct device_node *np; int nodeid, cpu; @@ -207,12 +197,13 @@ static int setup_iic_hardcoded(void) printk(KERN_INFO "IIC for CPU %d at %lx\n", cpu, regs); iic->regs = ioremap(regs, sizeof(struct cbe_iic_thread_regs)); iic->target_id = (nodeid << 4) + ((cpu & 1) ? 0xf : 0xe); + iic->eoi_stack[0] = 0xff; } return 0; } -static int setup_iic(void) +static int __init setup_iic(void) { struct device_node *dn; unsigned long *regs; @@ -248,11 +239,14 @@ static int setup_iic(void) iic = &per_cpu(iic, np[0]); iic->regs = ioremap(regs[0], sizeof(struct cbe_iic_thread_regs)); iic->target_id = ((np[0] & 2) << 3) + ((np[0] & 1) ? 0xf : 0xe); + iic->eoi_stack[0] = 0xff; printk("IIC for CPU %d at %lx mapped to %p\n", np[0], regs[0], iic->regs); iic = &per_cpu(iic, np[1]); iic->regs = ioremap(regs[2], sizeof(struct cbe_iic_thread_regs)); iic->target_id = ((np[1] & 2) << 3) + ((np[1] & 1) ?
0xf : 0xe); + iic->eoi_stack[0] = 0xff; + printk("IIC for CPU %d at %lx mapped to %p\n", np[1], regs[2], iic->regs); found++; @@ -304,10 +298,10 @@ static void iic_request_ipi(int ipi, const char *name) int irq; irq = iic_ipi_to_irq(ipi); + /* IPIs are marked IRQF_DISABLED as they must run with irqs * disabled */ - get_irq_desc(irq)->chip = &iic_pic; - get_irq_desc(irq)->status |= IRQ_PER_CPU; + set_irq_chip_and_handler(irq, &iic_chip, handle_percpu_irq); request_irq(irq, iic_ipi_action, IRQF_DISABLED, name, NULL); } @@ -321,20 +315,26 @@ void iic_request_IPIs(void) } #endif /* CONFIG_SMP */ -static void iic_setup_spe_handlers(void) +static void __init iic_setup_builtin_handlers(void) { int be, isrc; - /* Assume two threads per BE are present */ + /* XXX FIXME: Assume two threads per BE are present */ for (be=0; be < num_present_cpus() / 2; be++) { + int irq; + + /* setup SPE chip and handlers */ for (isrc = 0; isrc < IIC_CLASS_STRIDE * 3; isrc++) { - int irq = IIC_NODE_STRIDE * be + IIC_SPE_OFFSET + isrc; - get_irq_desc(irq)->chip = &iic_pic; + irq = IIC_NODE_STRIDE * be + IIC_SPE_OFFSET + isrc; + set_irq_chip_and_handler(irq, &iic_chip, handle_fasteoi_irq); } + /* setup cascade chip */ + irq = IIC_EXT_CASCADE + be * IIC_NODE_STRIDE; + set_irq_chip_and_handler(irq, &iic_chip, handle_fasteoi_irq); } } -void iic_init_IRQ(void) +void __init iic_init_IRQ(void) { int cpu, irq_offset; struct iic *iic; @@ -348,5 +348,6 @@ void iic_init_IRQ(void) if (iic->regs) out_be64(&iic->regs->prio, 0xff); } - iic_setup_spe_handlers(); + iic_setup_builtin_handlers(); + } diff --git a/arch/powerpc/platforms/cell/interrupt.h b/arch/powerpc/platforms/cell/interrupt.h index 799f77d98f96..c74515aeb630 100644 --- a/arch/powerpc/platforms/cell/interrupt.h +++ b/arch/powerpc/platforms/cell/interrupt.h @@ -38,6 +38,7 @@ enum { IIC_EXT_OFFSET = 0x00, /* Start of south bridge IRQs */ + IIC_EXT_CASCADE = 0x20, /* There is no interrupt 32 on spider */ IIC_NUM_EXT = 0x40, /* Number of south bridge IRQs */ IIC_SPE_OFFSET = 0x40, /* Start of SPE interrupts */ IIC_CLASS_STRIDE = 0x10, /* SPE IRQs per class */ @@ -51,13 +52,10 @@ extern int iic_get_irq(struct pt_regs *regs); extern void iic_cause_IPI(int cpu, int mesg); extern void iic_request_IPIs(void); extern void iic_setup_cpu(void); -extern void iic_local_enable(void); -extern void iic_local_disable(void); extern u8 iic_get_target_id(int cpu); extern void spider_init_IRQ(void); -extern int spider_get_irq(int node); #endif #endif /* ASM_CELL_PIC_H */ diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index 01ec39431fb0..70a4e903f7e2 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -80,10 +80,14 @@ static void cell_progress(char *s, unsigned short hex) printk("*** %04x : %s\n", hex, s ? 
s : ""); } +static void __init cell_init_irq(void) +{ + iic_init_IRQ(); + spider_init_IRQ(); +} + static void __init cell_setup_arch(void) { - ppc_md.init_IRQ = iic_init_IRQ; - ppc_md.get_irq = iic_get_irq; #ifdef CONFIG_SPU_BASE spu_priv1_ops = &spu_priv1_mmio_ops; #endif @@ -109,7 +113,6 @@ static void __init cell_setup_arch(void) /* Find and initialize PCI host bridges */ init_pci_config_tokens(); find_and_init_phbs(); - spider_init_IRQ(); cbe_pervasive_init(); #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; @@ -174,6 +177,9 @@ define_machine(cell) { .calibrate_decr = generic_calibrate_decr, .check_legacy_ioport = cell_check_legacy_ioport, .progress = cell_progress, + .init_IRQ = cell_init_irq, + .get_irq = iic_get_irq, + #ifdef CONFIG_KEXEC .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 7c3a0b6d34fd..98425acb6cda 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -82,17 +82,20 @@ static void __iomem *spider_get_irq_config(int irq) return pic + TIR_CFGA + 8 * spider_get_nr(irq); } -static void spider_enable_irq(unsigned int irq) +static void spider_unmask_irq(unsigned int irq) { int nodeid = (irq / IIC_NODE_STRIDE) * 0x10; void __iomem *cfg = spider_get_irq_config(irq); irq = spider_get_nr(irq); + /* FIXME: Most of that is configuration and has nothing to do with enabling/disable, + * besides, it's also partially bogus. + */ out_be32(cfg, (in_be32(cfg) & ~0xf0)| 0x3107000eu | nodeid); out_be32(cfg + 4, in_be32(cfg + 4) | 0x00020000u | irq); } -static void spider_disable_irq(unsigned int irq) +static void spider_mask_irq(unsigned int irq) { void __iomem *cfg = spider_get_irq_config(irq); irq = spider_get_nr(irq); @@ -100,39 +103,21 @@ static void spider_disable_irq(unsigned int irq) out_be32(cfg, in_be32(cfg) & ~0x30000000u); } -static unsigned int spider_startup_irq(unsigned int irq) -{ - spider_enable_irq(irq); - return 0; -} - -static void spider_shutdown_irq(unsigned int irq) -{ - spider_disable_irq(irq); -} - -static void spider_end_irq(unsigned int irq) -{ - spider_enable_irq(irq); -} - static void spider_ack_irq(unsigned int irq) { - spider_disable_irq(irq); - iic_local_enable(); + /* Should reset edge detection logic but we don't configure any edge interrupt + * at the moment. 
+ */ } -static struct hw_interrupt_type spider_pic = { +static struct irq_chip spider_pic = { .typename = " SPIDER ", - .startup = spider_startup_irq, - .shutdown = spider_shutdown_irq, - .enable = spider_enable_irq, - .disable = spider_disable_irq, + .unmask = spider_unmask_irq, + .mask = spider_mask_irq, .ack = spider_ack_irq, - .end = spider_end_irq, }; -int spider_get_irq(int node) +static int spider_get_irq(int node) { unsigned long cs; void __iomem *regs = spider_pics[node]; @@ -145,95 +130,89 @@ int spider_get_irq(int node) return cs; } +static void spider_irq_cascade(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) +{ + int node = (int)(long)desc->handler_data; + int cascade_irq; + + cascade_irq = spider_get_irq(node); + generic_handle_irq(cascade_irq, regs); + desc->chip->eoi(irq); +} + /* hardcoded part to be compatible with older firmware */ -void spider_init_IRQ_hardcoded(void) +static void __init spider_init_one(int node, unsigned long addr) { - int node; - long spiderpic; - long pics[] = { 0x24000008000, 0x34000008000 }; - int n; - - pr_debug("%s(%d): Using hardcoded defaults\n", __FUNCTION__, __LINE__); - - for (node = 0; node < num_present_cpus()/2; node++) { - spiderpic = pics[node]; - printk(KERN_DEBUG "SPIDER addr: %lx\n", spiderpic); - spider_pics[node] = ioremap(spiderpic, 0x800); - for (n = 0; n < IIC_NUM_EXT; n++) { - int irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE; - get_irq_desc(irq)->chip = &spider_pic; - } - - /* do not mask any interrupts because of level */ - out_be32(spider_pics[node] + TIR_MSK, 0x0); - - /* disable edge detection clear */ - /* out_be32(spider_pics[node] + TIR_EDC, 0x0); */ - - /* enable interrupt packets to be output */ - out_be32(spider_pics[node] + TIR_PIEN, - in_be32(spider_pics[node] + TIR_PIEN) | 0x1); - - /* Enable the interrupt detection enable bit. Do this last! */ - out_be32(spider_pics[node] + TIR_DEN, - in_be32(spider_pics[node] + TIR_DEN) | 0x1); + int n, irq; + + spider_pics[node] = ioremap(addr, 0x800); + if (spider_pics[node] == NULL) + panic("spider_pic: can't map registers !"); + + printk(KERN_INFO "spider_pic: mapped for node %d, addr: 0x%lx mapped to %p\n", + node, addr, spider_pics[node]); + + for (n = 0; n < IIC_NUM_EXT; n++) { + if (n == IIC_EXT_CASCADE) + continue; + irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE; + set_irq_chip_and_handler(irq, &spider_pic, handle_level_irq); + get_irq_desc(irq)->status |= IRQ_LEVEL; } + + /* do not mask any interrupts because of level */ + out_be32(spider_pics[node] + TIR_MSK, 0x0); + + /* disable edge detection clear */ + /* out_be32(spider_pics[node] + TIR_EDC, 0x0); */ + + /* enable interrupt packets to be output */ + out_be32(spider_pics[node] + TIR_PIEN, + in_be32(spider_pics[node] + TIR_PIEN) | 0x1); + + /* Hook up cascade */ + irq = IIC_EXT_CASCADE + node * IIC_NODE_STRIDE; + set_irq_data(irq, (void *)(long)node); + set_irq_chained_handler(irq, spider_irq_cascade); + + /* Enable the interrupt detection enable bit. Do this last! */ + out_be32(spider_pics[node] + TIR_DEN, + in_be32(spider_pics[node] + TIR_DEN) | 0x1); } -void spider_init_IRQ(void) +void __init spider_init_IRQ(void) { - long spider_reg; + unsigned long *spider_reg; struct device_node *dn; char *compatible; - int n, node = 0; - + int node = 0; + + /* XXX node numbers are totally bogus. We _hope_ we get the device nodes in the right + * order here but that's definitely not guaranteed, we need to get the node from the + * device tree instead. 
There is currently no proper property for it (but our whole + * device-tree is bogus anyway) so all we can do is pray or maybe test the address + * and deduce the node-id + */ for (dn = NULL; (dn = of_find_node_by_name(dn, "interrupt-controller"));) { compatible = (char *)get_property(dn, "compatible", NULL); if (!compatible) continue; - if (strstr(compatible, "CBEA,platform-spider-pic")) - spider_reg = *(long *)get_property(dn,"reg", NULL); - else if (strstr(compatible, "sti,platform-spider-pic")) { - spider_init_IRQ_hardcoded(); - return; + if (strstr(compatible, "CBEA,platform-spider-pic")) + spider_reg = (unsigned long *)get_property(dn, "reg", NULL); + else if (strstr(compatible, "sti,platform-spider-pic") && (node < 2)) { + static long hard_coded_pics[] = { 0x24000008000, 0x34000008000 }; + spider_reg = &hard_coded_pics[node]; } else continue; - if (!spider_reg) - printk("interrupt controller does not have reg property !\n"); - - n = prom_n_addr_cells(dn); - - if ( n != 2) - printk("reg property with invalid number of elements \n"); - - spider_pics[node] = ioremap(spider_reg, 0x800); - - printk("SPIDER addr: %lx with %i addr_cells mapped to %p\n", - spider_reg, n, spider_pics[node]); - - for (n = 0; n < IIC_NUM_EXT; n++) { - int irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE; - get_irq_desc(irq)->chip = &spider_pic; - } - - /* do not mask any interrupts because of level */ - out_be32(spider_pics[node] + TIR_MSK, 0x0); - - /* disable edge detection clear */ - /* out_be32(spider_pics[node] + TIR_EDC, 0x0); */ - - /* enable interrupt packets to be output */ - out_be32(spider_pics[node] + TIR_PIEN, - in_be32(spider_pics[node] + TIR_PIEN) | 0x1); - - /* Enable the interrupt detection enable bit. Do this last! */ - out_be32(spider_pics[node] + TIR_DEN, - in_be32(spider_pics[node] + TIR_DEN) | 0x1); + if (spider_reg == NULL) + printk(KERN_ERR "spider_pic: No address for node %d\n", node); + spider_init_one(node, *spider_reg); node++; } } diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index 1f1771b212b4..a5dffc8d93a9 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -315,6 +315,21 @@ chrp_event_scan(unsigned long unused) jiffies + event_scan_interval); } +void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) +{ + unsigned int max = 100; + + while(max--) { + int irq = i8259_irq(regs); + if (max == 99) + desc->chip->eoi(irq); + if (irq < 0) + break; + generic_handle_irq(irq, regs); + }; +} + /* * Finds the open-pic node and sets up the mpic driver. 
*/ @@ -402,7 +417,7 @@ static void __init chrp_find_openpic(void) } mpic_init(chrp_mpic); - mpic_setup_cascade(NUM_ISA_INTERRUPTS, i8259_irq_cascade, NULL); + set_irq_chained_handler(NUM_ISA_INTERRUPTS, chrp_8259_cascade); } #if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(XMON) diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index f70e820e7304..7fb6a08786c0 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -297,13 +297,13 @@ static void iseries_end_IRQ(unsigned int irq) (REAL_IRQ_TO_IDSEL(rirq) << 4) + REAL_IRQ_TO_FUNC(rirq)); } -static hw_irq_controller iSeries_IRQ_handler = { - .typename = "iSeries irq controller", - .startup = iseries_startup_IRQ, - .shutdown = iseries_shutdown_IRQ, - .enable = iseries_enable_IRQ, - .disable = iseries_disable_IRQ, - .end = iseries_end_IRQ +static struct irq_chip iseries_pic = { + .typename = "iSeries irq controller", + .startup = iseries_startup_IRQ, + .shutdown = iseries_shutdown_IRQ, + .unmask = iseries_enable_IRQ, + .mask = iseries_disable_IRQ, + .eoi = iseries_end_IRQ }; /* @@ -322,8 +322,7 @@ int __init iSeries_allocate_IRQ(HvBusNumber bus, realirq = (((((sub_bus << 8) + (bus - 1)) << 3) + (idsel - 1)) << 3) + function; virtirq = virt_irq_create_mapping(realirq); - - irq_desc[virtirq].chip = &iSeries_IRQ_handler; + set_irq_chip_and_handler(virtirq, &iseries_pic, handle_fasteoi_irq); return virtirq; } diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index c9b09a9e6050..58a4c7b90b8b 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -70,18 +70,19 @@ static struct interrupt_info gatwick_int_pool[GATWICK_IRQ_POOL_SIZE]; #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; +static unsigned long ppc_cached_irq_mask[NR_MASK_WORDS]; +static int pmac_irq_cascade = -1; -/* - * Mark an irq as "lost". This is only used on the pmac - * since it can lose interrupts (see pmac_set_irq_mask). 
- * -- Cort - */ -void __set_lost(unsigned long irq_nr, int nokick) +static void __pmac_retrigger(unsigned int irq_nr) { - if (!test_and_set_bit(irq_nr, ppc_lost_interrupts)) { + if (irq_nr >= max_real_irqs && pmac_irq_cascade > 0) { + __set_bit(irq_nr, ppc_lost_interrupts); + irq_nr = pmac_irq_cascade; + mb(); + } + if (!__test_and_set_bit(irq_nr, ppc_lost_interrupts)) { atomic_inc(&ppc_n_lost_interrupts); - if (!nokick) - set_dec(1); + set_dec(1); } } @@ -94,10 +95,10 @@ static void pmac_mask_and_ack_irq(unsigned int irq_nr) if ((unsigned)irq_nr >= max_irqs) return; - clear_bit(irq_nr, ppc_cached_irq_mask); - if (test_and_clear_bit(irq_nr, ppc_lost_interrupts)) - atomic_dec(&ppc_n_lost_interrupts); spin_lock_irqsave(&pmac_pic_lock, flags); + __clear_bit(irq_nr, ppc_cached_irq_mask); + if (__test_and_clear_bit(irq_nr, ppc_lost_interrupts)) + atomic_dec(&ppc_n_lost_interrupts); out_le32(&pmac_irq_hw[i]->enable, ppc_cached_irq_mask[i]); out_le32(&pmac_irq_hw[i]->ack, bit); do { @@ -109,7 +110,7 @@ static void pmac_mask_and_ack_irq(unsigned int irq_nr) spin_unlock_irqrestore(&pmac_pic_lock, flags); } -static void pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) +static void pmac_ack_irq(unsigned int irq_nr) { unsigned long bit = 1UL << (irq_nr & 0x1f); int i = irq_nr >> 5; @@ -118,7 +119,22 @@ static void pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) if ((unsigned)irq_nr >= max_irqs) return; - spin_lock_irqsave(&pmac_pic_lock, flags); + spin_lock_irqsave(&pmac_pic_lock, flags); + if (__test_and_clear_bit(irq_nr, ppc_lost_interrupts)) + atomic_dec(&ppc_n_lost_interrupts); + out_le32(&pmac_irq_hw[i]->ack, bit); + (void)in_le32(&pmac_irq_hw[i]->ack); + spin_unlock_irqrestore(&pmac_pic_lock, flags); +} + +static void __pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) +{ + unsigned long bit = 1UL << (irq_nr & 0x1f); + int i = irq_nr >> 5; + + if ((unsigned)irq_nr >= max_irqs) + return; + /* enable unmasked interrupts */ out_le32(&pmac_irq_hw[i]->enable, ppc_cached_irq_mask[i]); @@ -135,8 +151,7 @@ static void pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) * the bit in the flag register or request another interrupt. 
*/ if (bit & ppc_cached_irq_mask[i] & in_le32(&pmac_irq_hw[i]->level)) - __set_lost((ulong)irq_nr, nokicklost); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + __pmac_retrigger(irq_nr); } /* When an irq gets requested for the first client, if it's an @@ -144,62 +159,67 @@ static void pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) */ static unsigned int pmac_startup_irq(unsigned int irq_nr) { + unsigned long flags; unsigned long bit = 1UL << (irq_nr & 0x1f); int i = irq_nr >> 5; + spin_lock_irqsave(&pmac_pic_lock, flags); if ((irq_desc[irq_nr].status & IRQ_LEVEL) == 0) out_le32(&pmac_irq_hw[i]->ack, bit); - set_bit(irq_nr, ppc_cached_irq_mask); - pmac_set_irq_mask(irq_nr, 0); + __set_bit(irq_nr, ppc_cached_irq_mask); + __pmac_set_irq_mask(irq_nr, 0); + spin_unlock_irqrestore(&pmac_pic_lock, flags); return 0; } static void pmac_mask_irq(unsigned int irq_nr) { - clear_bit(irq_nr, ppc_cached_irq_mask); - pmac_set_irq_mask(irq_nr, 0); - mb(); + unsigned long flags; + + spin_lock_irqsave(&pmac_pic_lock, flags); + __clear_bit(irq_nr, ppc_cached_irq_mask); + __pmac_set_irq_mask(irq_nr, 0); + spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void pmac_unmask_irq(unsigned int irq_nr) { - set_bit(irq_nr, ppc_cached_irq_mask); - pmac_set_irq_mask(irq_nr, 0); + unsigned long flags; + + spin_lock_irqsave(&pmac_pic_lock, flags); + __set_bit(irq_nr, ppc_cached_irq_mask); + __pmac_set_irq_mask(irq_nr, 0); + spin_unlock_irqrestore(&pmac_pic_lock, flags); } -static void pmac_end_irq(unsigned int irq_nr) +static int pmac_retrigger(unsigned int irq_nr) { - if (!(irq_desc[irq_nr].status & (IRQ_DISABLED|IRQ_INPROGRESS)) - && irq_desc[irq_nr].action) { - set_bit(irq_nr, ppc_cached_irq_mask); - pmac_set_irq_mask(irq_nr, 1); - } -} + unsigned long flags; + spin_lock_irqsave(&pmac_pic_lock, flags); + __pmac_retrigger(irq_nr); + spin_unlock_irqrestore(&pmac_pic_lock, flags); + return 1; +} -struct hw_interrupt_type pmac_pic = { +static struct irq_chip pmac_pic = { .typename = " PMAC-PIC ", .startup = pmac_startup_irq, - .enable = pmac_unmask_irq, - .disable = pmac_mask_irq, - .ack = pmac_mask_and_ack_irq, - .end = pmac_end_irq, -}; - -struct hw_interrupt_type gatwick_pic = { - .typename = " GATWICK ", - .startup = pmac_startup_irq, - .enable = pmac_unmask_irq, - .disable = pmac_mask_irq, - .ack = pmac_mask_and_ack_irq, - .end = pmac_end_irq, + .mask = pmac_mask_irq, + .ack = pmac_ack_irq, + .mask_ack = pmac_mask_and_ack_irq, + .unmask = pmac_unmask_irq, + .retrigger = pmac_retrigger, }; static irqreturn_t gatwick_action(int cpl, void *dev_id, struct pt_regs *regs) { + unsigned long flags; int irq, bits; + int rc = IRQ_NONE; + spin_lock_irqsave(&pmac_pic_lock, flags); for (irq = max_irqs; (irq -= 32) >= max_real_irqs; ) { int i = irq >> 5; bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i]; @@ -209,17 +229,20 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id, struct pt_regs *regs) if (bits == 0) continue; irq += __ilog2(bits); + spin_unlock_irqrestore(&pmac_pic_lock, flags); __do_IRQ(irq, regs); - return IRQ_HANDLED; + spin_lock_irqsave(&pmac_pic_lock, flags); + rc = IRQ_HANDLED; } - printk("gatwick irq not from gatwick pic\n"); - return IRQ_NONE; + spin_unlock_irqrestore(&pmac_pic_lock, flags); + return rc; } static int pmac_get_irq(struct pt_regs *regs) { int irq; unsigned long bits = 0; + unsigned long flags; #ifdef CONFIG_SMP void psurge_smp_message_recv(struct pt_regs *); @@ -230,6 +253,7 @@ static int pmac_get_irq(struct pt_regs *regs) return -2; /* ignore, already handled 
*/ } #endif /* CONFIG_SMP */ + spin_lock_irqsave(&pmac_pic_lock, flags); for (irq = max_real_irqs; (irq -= 32) >= 0; ) { int i = irq >> 5; bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i]; @@ -241,6 +265,7 @@ static int pmac_get_irq(struct pt_regs *regs) irq += __ilog2(bits); break; } + spin_unlock_irqrestore(&pmac_pic_lock, flags); return irq; } @@ -389,7 +414,6 @@ static struct irqaction gatwick_cascade_action = { static void __init pmac_pic_probe_oldstyle(void) { int i; - int irq_cascade = -1; struct device_node *master = NULL; struct device_node *slave = NULL; u8 __iomem *addr; @@ -443,9 +467,16 @@ static void __init pmac_pic_probe_oldstyle(void) } BUG_ON(master == NULL); - /* Set the handler for the main PIC */ - for ( i = 0; i < max_real_irqs ; i++ ) - irq_desc[i].chip = &pmac_pic; + /* Mark level interrupts and set handlers */ + for (i = 0; i < max_irqs; i++) { + int level = !!(level_mask[i >> 5] & (1UL << (i & 0x1f))); + if (level) + irq_desc[i].status |= IRQ_LEVEL; + else + irq_desc[i].status |= IRQ_DELAYED_DISABLE; + set_irq_chip_and_handler(i, &pmac_pic, level ? + handle_level_irq : handle_edge_irq); + } /* Get addresses of first controller if we have a node for it */ BUG_ON(of_address_to_resource(master, 0, &r)); @@ -472,29 +503,22 @@ static void __init pmac_pic_probe_oldstyle(void) pmac_irq_hw[i++] = (volatile struct pmac_irq_hw __iomem *) (addr + 0x10); - irq_cascade = slave->intrs[0].line; + pmac_irq_cascade = slave->intrs[0].line; printk(KERN_INFO "irq: Found slave Apple PIC %s for %d irqs" " cascade: %d\n", slave->full_name, - max_irqs - max_real_irqs, irq_cascade); + max_irqs - max_real_irqs, pmac_irq_cascade); } of_node_put(slave); - /* disable all interrupts in all controllers */ + /* Disable all interrupts in all controllers */ for (i = 0; i * 32 < max_irqs; ++i) out_le32(&pmac_irq_hw[i]->enable, 0); - /* mark level interrupts */ - for (i = 0; i < max_irqs; i++) - if (level_mask[i >> 5] & (1UL << (i & 0x1f))) - irq_desc[i].status = IRQ_LEVEL; + /* Hookup cascade irq */ + if (slave) + setup_irq(pmac_irq_cascade, &gatwick_cascade_action); - /* Setup handlers for secondary controller and hook cascade irq*/ - if (slave) { - for ( i = max_real_irqs ; i < max_irqs ; i++ ) - irq_desc[i].chip = &gatwick_pic; - setup_irq(irq_cascade, &gatwick_cascade_action); - } printk(KERN_INFO "irq: System has %d possible interrupts\n", max_irqs); #ifdef CONFIG_XMON setup_irq(20, &xmon_action); @@ -502,9 +526,20 @@ static void __init pmac_pic_probe_oldstyle(void) } #endif /* CONFIG_PPC32 */ -static int pmac_u3_cascade(struct pt_regs *regs, void *data) +static void pmac_u3_cascade(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) { - return mpic_get_one_irq((struct mpic *)data, regs); + struct mpic *mpic = desc->handler_data; + unsigned int max = 100; + + while(max--) { + int cascade_irq = mpic_get_one_irq(mpic, regs); + if (max == 99) + desc->chip->eoi(irq); + if (irq < 0) + break; + generic_handle_irq(cascade_irq, regs); + }; } static void __init pmac_pic_setup_mpic_nmi(struct mpic *mpic) @@ -612,7 +647,8 @@ static int __init pmac_pic_probe_mpic(void) of_node_put(slave); return 0; } - mpic_setup_cascade(slave->intrs[0].line, pmac_u3_cascade, mpic2); + set_irq_data(slave->intrs[0].line, mpic2); + set_irq_chained_handler(slave->intrs[0].line, pmac_u3_cascade); of_node_put(slave); return 0; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 999509d28af8..476b564a208b 100644 --- 
a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -118,6 +118,21 @@ static void __init fwnmi_init(void) fwnmi_active = 1; } +void pSeries_8259_cascade(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) +{ + unsigned int max = 100; + + while(max--) { + int cascade_irq = i8259_irq(regs); + if (max == 99) + desc->chip->eoi(irq); + if (cascade_irq < 0) + break; + generic_handle_irq(cascade_irq, regs); + }; +} + static void __init pSeries_init_mpic(void) { unsigned int *addrp; @@ -140,7 +155,7 @@ static void __init pSeries_init_mpic(void) i8259_init(intack, 0); /* Hook cascade to mpic */ - mpic_setup_cascade(NUM_ISA_INTERRUPTS, i8259_irq_cascade, NULL); + set_irq_chained_handler(NUM_ISA_INTERRUPTS, pSeries_8259_cascade); } static void __init pSeries_setup_mpic(void) @@ -201,10 +216,8 @@ static void __init pSeries_setup_arch(void) /* Allocate the mpic now, so that find_and_init_phbs() can * fill the ISUs */ pSeries_setup_mpic(); - } else { + } else ppc_md.init_IRQ = xics_init_IRQ; - ppc_md.get_irq = xics_get_irq; - } #ifdef CONFIG_SMP smp_init_pSeries(); @@ -291,10 +304,7 @@ static void pSeries_mach_cpu_die(void) { local_irq_disable(); idle_task_exit(); - /* Some hardware requires clearing the CPPR, while other hardware does not - * it is safe either way - */ - pSeriesLP_cppr_info(0, 0); + xics_teardown_cpu(0); rtas_stop_self(); /* Should never get here... */ BUG(); diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index 2ffebe31cb2d..c7f04420066d 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -31,23 +31,6 @@ #include "xics.h" -static unsigned int xics_startup(unsigned int irq); -static void xics_enable_irq(unsigned int irq); -static void xics_disable_irq(unsigned int irq); -static void xics_mask_and_ack_irq(unsigned int irq); -static void xics_end_irq(unsigned int irq); -static void xics_set_affinity(unsigned int irq_nr, cpumask_t cpumask); - -static struct hw_interrupt_type xics_pic = { - .typename = " XICS ", - .startup = xics_startup, - .enable = xics_enable_irq, - .disable = xics_disable_irq, - .ack = xics_mask_and_ack_irq, - .end = xics_end_irq, - .set_affinity = xics_set_affinity -}; - /* This is used to map real irq numbers to virtual */ static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_ATOMIC); @@ -98,47 +81,33 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; -typedef struct { - int (*xirr_info_get)(int cpu); - void (*xirr_info_set)(int cpu, int val); - void (*cppr_info)(int cpu, u8 val); - void (*qirr_info)(int cpu, u8 val); -} xics_ops; +/* Direct HW low level accessors */ -/* SMP */ -static int pSeries_xirr_info_get(int n_cpu) +static inline int direct_xirr_info_get(int n_cpu) { return in_be32(&xics_per_cpu[n_cpu]->xirr.word); } -static void pSeries_xirr_info_set(int n_cpu, int value) +static inline void direct_xirr_info_set(int n_cpu, int value) { out_be32(&xics_per_cpu[n_cpu]->xirr.word, value); } -static void pSeries_cppr_info(int n_cpu, u8 value) +static inline void direct_cppr_info(int n_cpu, u8 value) { out_8(&xics_per_cpu[n_cpu]->xirr.bytes[0], value); } -static void pSeries_qirr_info(int n_cpu, u8 value) +static inline void direct_qirr_info(int n_cpu, u8 value) { out_8(&xics_per_cpu[n_cpu]->qirr.bytes[0], value); } -static xics_ops pSeries_ops = { - pSeries_xirr_info_get, - pSeries_xirr_info_set, - pSeries_cppr_info, - pSeries_qirr_info -}; - -static xics_ops *ops = &pSeries_ops; +/* LPAR low level 
accessors */ -/* LPAR */ static inline long plpar_eoi(unsigned long xirr) { @@ -161,7 +130,7 @@ static inline long plpar_xirr(unsigned long *xirr_ret) return plpar_hcall(H_XIRR, 0, 0, 0, 0, xirr_ret, &dummy, &dummy); } -static int pSeriesLP_xirr_info_get(int n_cpu) +static inline int lpar_xirr_info_get(int n_cpu) { unsigned long lpar_rc; unsigned long return_value; @@ -172,7 +141,7 @@ static int pSeriesLP_xirr_info_get(int n_cpu) return (int)return_value; } -static void pSeriesLP_xirr_info_set(int n_cpu, int value) +static inline void lpar_xirr_info_set(int n_cpu, int value) { unsigned long lpar_rc; unsigned long val64 = value & 0xffffffff; @@ -183,7 +152,7 @@ static void pSeriesLP_xirr_info_set(int n_cpu, int value) val64); } -void pSeriesLP_cppr_info(int n_cpu, u8 value) +static inline void lpar_cppr_info(int n_cpu, u8 value) { unsigned long lpar_rc; @@ -192,7 +161,7 @@ void pSeriesLP_cppr_info(int n_cpu, u8 value) panic("bad return code cppr - rc = %lx\n", lpar_rc); } -static void pSeriesLP_qirr_info(int n_cpu , u8 value) +static inline void lpar_qirr_info(int n_cpu , u8 value) { unsigned long lpar_rc; @@ -201,36 +170,9 @@ static void pSeriesLP_qirr_info(int n_cpu , u8 value) panic("bad return code qirr - rc = %lx\n", lpar_rc); } -xics_ops pSeriesLP_ops = { - pSeriesLP_xirr_info_get, - pSeriesLP_xirr_info_set, - pSeriesLP_cppr_info, - pSeriesLP_qirr_info -}; -static unsigned int xics_startup(unsigned int virq) -{ - unsigned int irq; +/* High level handlers and init code */ - irq = irq_offset_down(virq); - if (radix_tree_insert(&irq_map, virt_irq_to_real(irq), - &virt_irq_to_real_map[irq]) == -ENOMEM) - printk(KERN_CRIT "Out of memory creating real -> virtual" - " IRQ mapping for irq %u (real 0x%x)\n", - virq, virt_irq_to_real(irq)); - xics_enable_irq(virq); - return 0; /* return value is ignored */ -} - -static unsigned int real_irq_to_virt(unsigned int real_irq) -{ - unsigned int *ptr; - - ptr = radix_tree_lookup(&irq_map, real_irq); - if (ptr == NULL) - return NO_IRQ; - return ptr - virt_irq_to_real_map; -} #ifdef CONFIG_SMP static int get_irq_server(unsigned int irq) @@ -264,17 +206,20 @@ static int get_irq_server(unsigned int irq) } #endif -static void xics_enable_irq(unsigned int virq) + +static void xics_unmask_irq(unsigned int virq) { unsigned int irq; int call_status; unsigned int server; irq = virt_irq_to_real(irq_offset_down(virq)); - if (irq == XICS_IPI) + WARN_ON(irq == NO_IRQ); + if (irq == XICS_IPI || irq == NO_IRQ) return; server = get_irq_server(virq); + call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, DEFAULT_PRIORITY); if (call_status != 0) { @@ -293,7 +238,7 @@ static void xics_enable_irq(unsigned int virq) } } -static void xics_disable_real_irq(unsigned int irq) +static void xics_mask_real_irq(unsigned int irq) { int call_status; unsigned int server; @@ -318,75 +263,104 @@ static void xics_disable_real_irq(unsigned int irq) } } -static void xics_disable_irq(unsigned int virq) +static void xics_mask_irq(unsigned int virq) { unsigned int irq; irq = virt_irq_to_real(irq_offset_down(virq)); - xics_disable_real_irq(irq); + WARN_ON(irq == NO_IRQ); + if (irq != NO_IRQ) + xics_mask_real_irq(irq); +} + +static void xics_set_irq_revmap(unsigned int virq) +{ + unsigned int irq; + + irq = irq_offset_down(virq); + if (radix_tree_insert(&irq_map, virt_irq_to_real(irq), + &virt_irq_to_real_map[irq]) == -ENOMEM) + printk(KERN_CRIT "Out of memory creating real -> virtual" + " IRQ mapping for irq %u (real 0x%x)\n", + virq, virt_irq_to_real(irq)); } -static void 
xics_end_irq(unsigned int irq) +static unsigned int xics_startup(unsigned int virq) +{ + xics_set_irq_revmap(virq); + xics_unmask_irq(virq); + return 0; +} + +static unsigned int real_irq_to_virt(unsigned int real_irq) +{ + unsigned int *ptr; + + ptr = radix_tree_lookup(&irq_map, real_irq); + if (ptr == NULL) + return NO_IRQ; + return ptr - virt_irq_to_real_map; +} + +static void xics_eoi_direct(unsigned int irq) { int cpu = smp_processor_id(); iosync(); - ops->xirr_info_set(cpu, ((0xff << 24) | - (virt_irq_to_real(irq_offset_down(irq))))); - + direct_xirr_info_set(cpu, ((0xff << 24) | + (virt_irq_to_real(irq_offset_down(irq))))); } -static void xics_mask_and_ack_irq(unsigned int irq) + +static void xics_eoi_lpar(unsigned int irq) { int cpu = smp_processor_id(); - if (irq < irq_offset_value()) { - i8259_pic.ack(irq); - iosync(); - ops->xirr_info_set(cpu, ((0xff<<24) | - xics_irq_8259_cascade_real)); - iosync(); - } + iosync(); + lpar_xirr_info_set(cpu, ((0xff << 24) | + (virt_irq_to_real(irq_offset_down(irq))))); + } -int xics_get_irq(struct pt_regs *regs) +static inline int xics_remap_irq(int vec) { - unsigned int cpu = smp_processor_id(); - unsigned int vec; int irq; - vec = ops->xirr_info_get(cpu); - /* (vec >> 24) == old priority */ vec &= 0x00ffffff; - /* for sanity, this had better be < NR_IRQS - 16 */ - if (vec == xics_irq_8259_cascade_real) { - irq = i8259_irq(regs); - xics_end_irq(irq_offset_up(xics_irq_8259_cascade)); - } else if (vec == XICS_IRQ_SPURIOUS) { - irq = -1; - } else { - irq = real_irq_to_virt(vec); - if (irq == NO_IRQ) - irq = real_irq_to_virt_slowpath(vec); - if (irq == NO_IRQ) { - printk(KERN_ERR "Interrupt %u (real) is invalid," - " disabling it.\n", vec); - xics_disable_real_irq(vec); - } else - irq = irq_offset_up(irq); - } - return irq; + if (vec == XICS_IRQ_SPURIOUS) + return NO_IRQ; + + irq = real_irq_to_virt(vec); + if (irq == NO_IRQ) + irq = real_irq_to_virt_slowpath(vec); + if (likely(irq != NO_IRQ)) + return irq_offset_up(irq); + + printk(KERN_ERR "Interrupt %u (real) is invalid," + " disabling it.\n", vec); + xics_mask_real_irq(vec); + return NO_IRQ; } -#ifdef CONFIG_SMP +static int xics_get_irq_direct(struct pt_regs *regs) +{ + unsigned int cpu = smp_processor_id(); -static irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs) + return xics_remap_irq(direct_xirr_info_get(cpu)); +} + +static int xics_get_irq_lpar(struct pt_regs *regs) { - int cpu = smp_processor_id(); + unsigned int cpu = smp_processor_id(); - ops->qirr_info(cpu, 0xff); + return xics_remap_irq(lpar_xirr_info_get(cpu)); +} + +#ifdef CONFIG_SMP +static irqreturn_t xics_ipi_dispatch(int cpu, struct pt_regs *regs) +{ WARN_ON(cpu_is_offline(cpu)); while (xics_ipi_message[cpu].value) { @@ -418,18 +392,108 @@ static irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs) return IRQ_HANDLED; } +static irqreturn_t xics_ipi_action_direct(int irq, void *dev_id, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + direct_qirr_info(cpu, 0xff); + + return xics_ipi_dispatch(cpu, regs); +} + +static irqreturn_t xics_ipi_action_lpar(int irq, void *dev_id, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + lpar_qirr_info(cpu, 0xff); + + return xics_ipi_dispatch(cpu, regs); +} + void xics_cause_IPI(int cpu) { - ops->qirr_info(cpu, IPI_PRIORITY); + if (firmware_has_feature(FW_FEATURE_LPAR)) + lpar_qirr_info(cpu, IPI_PRIORITY); + else + direct_qirr_info(cpu, IPI_PRIORITY); } + #endif /* CONFIG_SMP */ +static void xics_set_cpu_priority(int cpu, unsigned 
char cppr) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + lpar_cppr_info(cpu, cppr); + else + direct_cppr_info(cpu, cppr); + iosync(); +} + +static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) +{ + unsigned int irq; + int status; + int xics_status[2]; + unsigned long newmask; + cpumask_t tmp = CPU_MASK_NONE; + + irq = virt_irq_to_real(irq_offset_down(virq)); + if (irq == XICS_IPI || irq == NO_IRQ) + return; + + status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); + + if (status) { + printk(KERN_ERR "xics_set_affinity: irq=%u ibm,get-xive " + "returns %d\n", irq, status); + return; + } + + /* For the moment only implement delivery to all cpus or one cpu */ + if (cpus_equal(cpumask, CPU_MASK_ALL)) { + newmask = default_distrib_server; + } else { + cpus_and(tmp, cpu_online_map, cpumask); + if (cpus_empty(tmp)) + return; + newmask = get_hard_smp_processor_id(first_cpu(tmp)); + } + + status = rtas_call(ibm_set_xive, 3, 1, NULL, + irq, newmask, xics_status[1]); + + if (status) { + printk(KERN_ERR "xics_set_affinity: irq=%u ibm,set-xive " + "returns %d\n", irq, status); + return; + } +} + +static struct irq_chip xics_pic_direct = { + .typename = " XICS ", + .startup = xics_startup, + .mask = xics_mask_irq, + .unmask = xics_unmask_irq, + .eoi = xics_eoi_direct, + .set_affinity = xics_set_affinity +}; + + +static struct irq_chip xics_pic_lpar = { + .typename = " XICS ", + .startup = xics_startup, + .mask = xics_mask_irq, + .unmask = xics_unmask_irq, + .eoi = xics_eoi_lpar, + .set_affinity = xics_set_affinity +}; + + void xics_setup_cpu(void) { int cpu = smp_processor_id(); - ops->cppr_info(cpu, 0xff); - iosync(); + xics_set_cpu_priority(cpu, 0xff); /* * Put the calling processor into the GIQ. This is really only @@ -453,6 +517,7 @@ void xics_init_IRQ(void) unsigned long addr; unsigned long size; } intnodes[NR_CPUS]; + struct irq_chip *chip; ppc64_boot_msg(0x20, "XICS Init"); @@ -519,26 +584,10 @@ nextnode: intr_base = intnodes[0].addr; intr_size = intnodes[0].size; - np = of_find_node_by_type(NULL, "interrupt-controller"); - if (!np) { - printk(KERN_DEBUG "xics: no ISA interrupt controller\n"); - xics_irq_8259_cascade_real = -1; - xics_irq_8259_cascade = -1; - } else { - ireg = (uint *) get_property(np, "interrupts", NULL); - if (!ireg) - panic("xics_init_IRQ: can't find ISA interrupts property"); - - xics_irq_8259_cascade_real = *ireg; - xics_irq_8259_cascade - = virt_irq_create_mapping(xics_irq_8259_cascade_real); - i8259_init(0, 0); - of_node_put(np); - } - - if (firmware_has_feature(FW_FEATURE_LPAR)) - ops = &pSeriesLP_ops; - else { + if (firmware_has_feature(FW_FEATURE_LPAR)) { + ppc_md.get_irq = xics_get_irq_lpar; + chip = &xics_pic_lpar; + } else { #ifdef CONFIG_SMP for_each_possible_cpu(i) { int hard_id; @@ -554,32 +603,54 @@ nextnode: #else xics_per_cpu[0] = ioremap(intr_base, intr_size); #endif /* CONFIG_SMP */ + ppc_md.get_irq = xics_get_irq_direct; + chip = &xics_pic_direct; + } - for (i = irq_offset_value(); i < NR_IRQS; ++i) - get_irq_desc(i)->chip = &xics_pic; + for (i = irq_offset_value(); i < NR_IRQS; ++i) { + /* All IRQs on XICS are level for now. 
MSI code may want to modify + * that for reporting purposes + */ + get_irq_desc(i)->status |= IRQ_LEVEL; + set_irq_chip_and_handler(i, chip, handle_fasteoi_irq); + } xics_setup_cpu(); ppc64_boot_msg(0x21, "XICS Done"); } -/* - * We cant do this in init_IRQ because we need the memory subsystem up for - * request_irq() - */ -static int __init xics_setup_i8259(void) -{ - if (ppc64_interrupt_controller == IC_PPC_XIC && - xics_irq_8259_cascade != -1) { - if (request_irq(irq_offset_up(xics_irq_8259_cascade), - no_action, 0, "8259 cascade", NULL)) - printk(KERN_ERR "xics_setup_i8259: couldn't get 8259 " - "cascade\n"); +static int xics_setup_8259_cascade(void) +{ + struct device_node *np; + uint *ireg; + + np = of_find_node_by_type(NULL, "interrupt-controller"); + if (np == NULL) { + printk(KERN_WARNING "xics: no ISA interrupt controller\n"); + xics_irq_8259_cascade_real = -1; + xics_irq_8259_cascade = -1; + return 0; } + + ireg = (uint *) get_property(np, "interrupts", NULL); + if (!ireg) + panic("xics_init_IRQ: can't find ISA interrupts property"); + + xics_irq_8259_cascade_real = *ireg; + xics_irq_8259_cascade = irq_offset_up + (virt_irq_create_mapping(xics_irq_8259_cascade_real)); + i8259_init(0, 0); + of_node_put(np); + + xics_set_irq_revmap(xics_irq_8259_cascade); + set_irq_chained_handler(xics_irq_8259_cascade, pSeries_8259_cascade); + return 0; } -arch_initcall(xics_setup_i8259); +arch_initcall(xics_setup_8259_cascade); + #ifdef CONFIG_SMP void xics_request_IPIs(void) @@ -590,61 +661,22 @@ void xics_request_IPIs(void) * IPIs are marked IRQF_DISABLED as they must run with irqs * disabled */ - request_irq(irq_offset_up(XICS_IPI), xics_ipi_action, - IRQF_DISABLED, "IPI", NULL); - get_irq_desc(irq_offset_up(XICS_IPI))->status |= IRQ_PER_CPU; -} -#endif - -static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) -{ - unsigned int irq; - int status; - int xics_status[2]; - unsigned long newmask; - cpumask_t tmp = CPU_MASK_NONE; - - irq = virt_irq_to_real(irq_offset_down(virq)); - if (irq == XICS_IPI || irq == NO_IRQ) - return; - - status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); - - if (status) { - printk(KERN_ERR "xics_set_affinity: irq=%u ibm,get-xive " - "returns %d\n", irq, status); - return; - } - - /* For the moment only implement delivery to all cpus or one cpu */ - if (cpus_equal(cpumask, CPU_MASK_ALL)) { - newmask = default_distrib_server; - } else { - cpus_and(tmp, cpu_online_map, cpumask); - if (cpus_empty(tmp)) - return; - newmask = get_hard_smp_processor_id(first_cpu(tmp)); - } - - status = rtas_call(ibm_set_xive, 3, 1, NULL, - irq, newmask, xics_status[1]); - - if (status) { - printk(KERN_ERR "xics_set_affinity: irq=%u ibm,set-xive " - "returns %d\n", irq, status); - return; - } + set_irq_handler(irq_offset_up(XICS_IPI), handle_percpu_irq); + if (firmware_has_feature(FW_FEATURE_LPAR)) + request_irq(irq_offset_up(XICS_IPI), xics_ipi_action_lpar, + SA_INTERRUPT, "IPI", NULL); + else + request_irq(irq_offset_up(XICS_IPI), xics_ipi_action_direct, + SA_INTERRUPT, "IPI", NULL); } +#endif /* CONFIG_SMP */ void xics_teardown_cpu(int secondary) { + struct irq_desc *desc = get_irq_desc(irq_offset_up(XICS_IPI)); int cpu = smp_processor_id(); - ops->cppr_info(cpu, 0x00); - iosync(); - - /* Clear IPI */ - ops->qirr_info(cpu, 0xff); + xics_set_cpu_priority(cpu, 0); /* * we need to EOI the IPI if we got here from kexec down IPI @@ -653,7 +685,8 @@ void xics_teardown_cpu(int secondary) * should we be flagging idle loop instead? * or creating some task to be scheduled? 
*/ - ops->xirr_info_set(cpu, XICS_IPI); + if (desc->chip && desc->chip->eoi) + desc->chip->eoi(XICS_IPI); /* * Some machines need to have at least one cpu in the GIQ, @@ -674,8 +707,7 @@ void xics_migrate_irqs_away(void) unsigned int irq, virq, cpu = smp_processor_id(); /* Reject any interrupt that was queued to us... */ - ops->cppr_info(cpu, 0); - iosync(); + xics_set_cpu_priority(cpu, 0); /* remove ourselves from the global interrupt queue */ status = rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE, @@ -683,11 +715,10 @@ void xics_migrate_irqs_away(void) WARN_ON(status < 0); /* Allow IPIs again... */ - ops->cppr_info(cpu, DEFAULT_PRIORITY); - iosync(); + xics_set_cpu_priority(cpu, DEFAULT_PRIORITY); for_each_irq(virq) { - irq_desc_t *desc; + struct irq_desc *desc; int xics_status[2]; unsigned long flags; diff --git a/arch/powerpc/platforms/pseries/xics.h b/arch/powerpc/platforms/pseries/xics.h index e14c70868f1d..67dedf3514ec 100644 --- a/arch/powerpc/platforms/pseries/xics.h +++ b/arch/powerpc/platforms/pseries/xics.h @@ -14,13 +14,12 @@ #include -void xics_init_IRQ(void); -int xics_get_irq(struct pt_regs *); -void xics_setup_cpu(void); -void xics_teardown_cpu(int secondary); -void xics_cause_IPI(int cpu); -void xics_request_IPIs(void); -void xics_migrate_irqs_away(void); +extern void xics_init_IRQ(void); +extern void xics_setup_cpu(void); +extern void xics_teardown_cpu(int secondary); +extern void xics_cause_IPI(int cpu); +extern void xics_request_IPIs(void); +extern void xics_migrate_irqs_away(void); /* first argument is ignored for now*/ void pSeriesLP_cppr_info(int n_cpu, u8 value); @@ -31,4 +30,8 @@ struct xics_ipi_struct { extern struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned; +struct irq_desc; +extern void pSeries_8259_cascade(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs); + #endif /* _POWERPC_KERNEL_XICS_H */ diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 1a3ef1ab9d6e..c2e9465871aa 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -69,11 +69,6 @@ int i8259_irq(struct pt_regs *regs) return irq + i8259_pic_irq_offset; } -int i8259_irq_cascade(struct pt_regs *regs, void *unused) -{ - return i8259_irq(regs); -} - static void i8259_mask_and_ack_irq(unsigned int irq_nr) { unsigned long flags; @@ -129,19 +124,11 @@ static void i8259_unmask_irq(unsigned int irq_nr) spin_unlock_irqrestore(&i8259_lock, flags); } -static void i8259_end_irq(unsigned int irq) -{ - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) - && irq_desc[irq].action) - i8259_unmask_irq(irq); -} - -struct hw_interrupt_type i8259_pic = { - .typename = " i8259 ", - .enable = i8259_unmask_irq, - .disable = i8259_mask_irq, - .ack = i8259_mask_and_ack_irq, - .end = i8259_end_irq, +static struct irq_chip i8259_pic = { + .typename = " i8259 ", + .mask = i8259_mask_irq, + .unmask = i8259_unmask_irq, + .mask_ack = i8259_mask_and_ack_irq, }; static struct resource pic1_iores = { @@ -207,8 +194,11 @@ void __init i8259_init(unsigned long intack_addr, int offset) spin_unlock_irqrestore(&i8259_lock, flags); - for (i = 0; i < NUM_ISA_INTERRUPTS; ++i) - irq_desc[offset + i].chip = &i8259_pic; + for (i = 0; i < NUM_ISA_INTERRUPTS; ++i) { + set_irq_chip_and_handler(offset + i, &i8259_pic, + handle_level_irq); + irq_desc[offset + i].status |= IRQ_LEVEL; + } /* reserve our resources */ setup_irq(offset + 2, &i8259_irqaction); diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 7e469358895f..9a95f16c19a5 100644 --- 
a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -100,8 +100,8 @@ static inline u32 _mpic_cpu_read(struct mpic *mpic, unsigned int reg) if (mpic->flags & MPIC_PRIMARY) cpu = hard_smp_processor_id(); - - return _mpic_read(mpic->flags & MPIC_BIG_ENDIAN, mpic->cpuregs[cpu], reg); + return _mpic_read(mpic->flags & MPIC_BIG_ENDIAN, + mpic->cpuregs[cpu], reg); } static inline void _mpic_cpu_write(struct mpic *mpic, unsigned int reg, u32 value) @@ -378,14 +378,14 @@ static inline u32 mpic_physmask(u32 cpumask) /* Get the mpic structure from the IPI number */ static inline struct mpic * mpic_from_ipi(unsigned int ipi) { - return container_of(irq_desc[ipi].chip, struct mpic, hc_ipi); + return irq_desc[ipi].chip_data; } #endif /* Get the mpic structure from the irq number */ static inline struct mpic * mpic_from_irq(unsigned int irq) { - return container_of(irq_desc[irq].chip, struct mpic, hc_irq); + return irq_desc[irq].chip_data; } /* Send an EOI */ @@ -410,7 +410,7 @@ static irqreturn_t mpic_ipi_action(int irq, void *dev_id, struct pt_regs *regs) */ -static void mpic_enable_irq(unsigned int irq) +static void mpic_unmask_irq(unsigned int irq) { unsigned int loops = 100000; struct mpic *mpic = mpic_from_irq(irq); @@ -429,35 +429,9 @@ static void mpic_enable_irq(unsigned int irq) break; } } while(mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & MPIC_VECPRI_MASK); - -#ifdef CONFIG_MPIC_BROKEN_U3 - if (mpic->flags & MPIC_BROKEN_U3) { - unsigned int src = irq - mpic->irq_offset; - if (mpic_is_ht_interrupt(mpic, src) && - (irq_desc[irq].status & IRQ_LEVEL)) - mpic_ht_end_irq(mpic, src); - } -#endif /* CONFIG_MPIC_BROKEN_U3 */ -} - -static unsigned int mpic_startup_irq(unsigned int irq) -{ -#ifdef CONFIG_MPIC_BROKEN_U3 - struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; -#endif /* CONFIG_MPIC_BROKEN_U3 */ - - mpic_enable_irq(irq); - -#ifdef CONFIG_MPIC_BROKEN_U3 - if (mpic_is_ht_interrupt(mpic, src)) - mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status); -#endif /* CONFIG_MPIC_BROKEN_U3 */ - - return 0; } -static void mpic_disable_irq(unsigned int irq) +static void mpic_mask_irq(unsigned int irq) { unsigned int loops = 100000; struct mpic *mpic = mpic_from_irq(irq); @@ -478,23 +452,58 @@ static void mpic_disable_irq(unsigned int irq) } while(!(mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & MPIC_VECPRI_MASK)); } -static void mpic_shutdown_irq(unsigned int irq) +static void mpic_end_irq(unsigned int irq) { + struct mpic *mpic = mpic_from_irq(irq); + +#ifdef DEBUG_IRQ + DBG("%s: end_irq: %d\n", mpic->name, irq); +#endif + /* We always EOI on end_irq() even for edge interrupts since that + * should only lower the priority, the MPIC should have properly + * latched another edge interrupt coming in anyway + */ + + mpic_eoi(mpic); +} + #ifdef CONFIG_MPIC_BROKEN_U3 + +static void mpic_unmask_ht_irq(unsigned int irq) +{ struct mpic *mpic = mpic_from_irq(irq); unsigned int src = irq - mpic->irq_offset; - if (mpic_is_ht_interrupt(mpic, src)) - mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status); + mpic_unmask_irq(irq); -#endif /* CONFIG_MPIC_BROKEN_U3 */ + if (irq_desc[irq].status & IRQ_LEVEL) + mpic_ht_end_irq(mpic, src); +} + +static unsigned int mpic_startup_ht_irq(unsigned int irq) +{ + struct mpic *mpic = mpic_from_irq(irq); + unsigned int src = irq - mpic->irq_offset; - mpic_disable_irq(irq); + mpic_unmask_irq(irq); + mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status); + + return 0; } -static void mpic_end_irq(unsigned int irq) +static void 
mpic_shutdown_ht_irq(unsigned int irq) +{ + struct mpic *mpic = mpic_from_irq(irq); + unsigned int src = irq - mpic->irq_offset; + + mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status); + mpic_mask_irq(irq); +} + +static void mpic_end_ht_irq(unsigned int irq) { struct mpic *mpic = mpic_from_irq(irq); + unsigned int src = irq - mpic->irq_offset; #ifdef DEBUG_IRQ DBG("%s: end_irq: %d\n", mpic->name, irq); @@ -504,21 +513,16 @@ static void mpic_end_irq(unsigned int irq) * latched another edge interrupt coming in anyway */ -#ifdef CONFIG_MPIC_BROKEN_U3 - if (mpic->flags & MPIC_BROKEN_U3) { - unsigned int src = irq - mpic->irq_offset; - if (mpic_is_ht_interrupt(mpic, src) && - (irq_desc[irq].status & IRQ_LEVEL)) - mpic_ht_end_irq(mpic, src); - } -#endif /* CONFIG_MPIC_BROKEN_U3 */ - + if (irq_desc[irq].status & IRQ_LEVEL) + mpic_ht_end_irq(mpic, src); mpic_eoi(mpic); } +#endif /* CONFIG_MPIC_BROKEN_U3 */ + #ifdef CONFIG_SMP -static void mpic_enable_ipi(unsigned int irq) +static void mpic_unmask_ipi(unsigned int irq) { struct mpic *mpic = mpic_from_ipi(irq); unsigned int src = irq - mpic->ipi_offset; @@ -527,7 +531,7 @@ static void mpic_enable_ipi(unsigned int irq) mpic_ipi_write(src, mpic_ipi_read(src) & ~MPIC_VECPRI_MASK); } -static void mpic_disable_ipi(unsigned int irq) +static void mpic_mask_ipi(unsigned int irq) { /* NEVER disable an IPI... that's just plain wrong! */ } @@ -560,6 +564,30 @@ static void mpic_set_affinity(unsigned int irq, cpumask_t cpumask) mpic_physmask(cpus_addr(tmp)[0])); } +static struct irq_chip mpic_irq_chip = { + .mask = mpic_mask_irq, + .unmask = mpic_unmask_irq, + .eoi = mpic_end_irq, +}; + +#ifdef CONFIG_SMP +static struct irq_chip mpic_ipi_chip = { + .mask = mpic_mask_ipi, + .unmask = mpic_unmask_ipi, + .eoi = mpic_end_ipi, +}; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_MPIC_BROKEN_U3 +static struct irq_chip mpic_irq_ht_chip = { + .startup = mpic_startup_ht_irq, + .shutdown = mpic_shutdown_ht_irq, + .mask = mpic_mask_irq, + .unmask = mpic_unmask_ht_irq, + .eoi = mpic_end_ht_irq, +}; +#endif /* CONFIG_MPIC_BROKEN_U3 */ + /* * Exported functions @@ -589,19 +617,19 @@ struct mpic * __init mpic_alloc(unsigned long phys_addr, memset(mpic, 0, sizeof(struct mpic)); mpic->name = name; + mpic->hc_irq = mpic_irq_chip; mpic->hc_irq.typename = name; - mpic->hc_irq.startup = mpic_startup_irq; - mpic->hc_irq.shutdown = mpic_shutdown_irq; - mpic->hc_irq.enable = mpic_enable_irq; - mpic->hc_irq.disable = mpic_disable_irq; - mpic->hc_irq.end = mpic_end_irq; if (flags & MPIC_PRIMARY) mpic->hc_irq.set_affinity = mpic_set_affinity; +#ifdef CONFIG_MPIC_BROKEN_U3 + mpic->hc_ht_irq = mpic_irq_ht_chip; + mpic->hc_ht_irq.typename = name; + if (flags & MPIC_PRIMARY) + mpic->hc_ht_irq.set_affinity = mpic_set_affinity; +#endif /* CONFIG_MPIC_BROKEN_U3 */ #ifdef CONFIG_SMP mpic->hc_ipi.typename = name; - mpic->hc_ipi.enable = mpic_enable_ipi; - mpic->hc_ipi.disable = mpic_disable_ipi; - mpic->hc_ipi.end = mpic_end_ipi; + mpic->hc_ipi = mpic_ipi_chip; #endif /* CONFIG_SMP */ mpic->flags = flags; @@ -697,28 +725,6 @@ void __init mpic_assign_isu(struct mpic *mpic, unsigned int isu_num, mpic->num_sources = isu_first + mpic->isu_size; } -void __init mpic_setup_cascade(unsigned int irq, mpic_cascade_t handler, - void *data) -{ - struct mpic *mpic = mpic_find(irq, NULL); - unsigned long flags; - - /* Synchronization here is a bit dodgy, so don't try to replace cascade - * interrupts on the fly too often ... but normally it's set up at boot. 
- */ - spin_lock_irqsave(&mpic_lock, flags); - if (mpic->cascade) - mpic_disable_irq(mpic->cascade_vec + mpic->irq_offset); - mpic->cascade = NULL; - wmb(); - mpic->cascade_vec = irq - mpic->irq_offset; - mpic->cascade_data = data; - wmb(); - mpic->cascade = handler; - mpic_enable_irq(irq); - spin_unlock_irqrestore(&mpic_lock, flags); -} - void __init mpic_init(struct mpic *mpic) { int i; @@ -750,8 +756,10 @@ void __init mpic_init(struct mpic *mpic) #ifdef CONFIG_SMP if (!(mpic->flags & MPIC_PRIMARY)) continue; - irq_desc[mpic->ipi_offset+i].status |= IRQ_PER_CPU; - irq_desc[mpic->ipi_offset+i].chip = &mpic->hc_ipi; + set_irq_chip_data(mpic->ipi_offset+i, mpic); + set_irq_chip_and_handler(mpic->ipi_offset+i, + &mpic->hc_ipi, + handle_percpu_irq); #endif /* CONFIG_SMP */ } @@ -763,7 +771,7 @@ void __init mpic_init(struct mpic *mpic) /* Do the HT PIC fixups on U3 broken mpic */ DBG("MPIC flags: %x\n", mpic->flags); if ((mpic->flags & MPIC_BROKEN_U3) && (mpic->flags & MPIC_PRIMARY)) - mpic_scan_ht_pics(mpic); + mpic_scan_ht_pics(mpic); #endif /* CONFIG_MPIC_BROKEN_U3 */ for (i = 0; i < mpic->num_sources; i++) { @@ -811,8 +819,17 @@ void __init mpic_init(struct mpic *mpic) /* init linux descriptors */ if (i < mpic->irq_count) { - irq_desc[mpic->irq_offset+i].status = level ? IRQ_LEVEL : 0; - irq_desc[mpic->irq_offset+i].chip = &mpic->hc_irq; + struct irq_chip *chip = &mpic->hc_irq; + + irq_desc[mpic->irq_offset+i].status |= + level ? IRQ_LEVEL : 0; +#ifdef CONFIG_MPIC_BROKEN_U3 + if (mpic_is_ht_interrupt(mpic, i)) + chip = &mpic->hc_ht_irq; +#endif /* CONFIG_MPIC_BROKEN_U3 */ + set_irq_chip_data(mpic->irq_offset+i, mpic); + set_irq_chip_and_handler(mpic->irq_offset+i, chip, + handle_fasteoi_irq); } } @@ -986,14 +1003,6 @@ int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs) #ifdef DEBUG_LOW DBG("%s: get_one_irq(): %d\n", mpic->name, irq); #endif - if (mpic->cascade && irq == mpic->cascade_vec) { -#ifdef DEBUG_LOW - DBG("%s: cascading ...\n", mpic->name); -#endif - irq = mpic->cascade(regs, mpic->cascade_data); - mpic_eoi(mpic); - return irq; - } if (unlikely(irq == MPIC_VEC_SPURRIOUS)) return -1; if (irq < MPIC_VEC_IPI_0) { diff --git a/include/asm-powerpc/i8259.h b/include/asm-powerpc/i8259.h index 0392159e16e4..ff31cb90325f 100644 --- a/include/asm-powerpc/i8259.h +++ b/include/asm-powerpc/i8259.h @@ -4,11 +4,8 @@ #include -extern struct hw_interrupt_type i8259_pic; - extern void i8259_init(unsigned long intack_addr, int offset); extern int i8259_irq(struct pt_regs *regs); -extern int i8259_irq_cascade(struct pt_regs *regs, void *unused); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_I8259_H */ diff --git a/include/asm-powerpc/irq.h b/include/asm-powerpc/irq.h index eb5f33e1977a..13fa2ef38dc7 100644 --- a/include/asm-powerpc/irq.h +++ b/include/asm-powerpc/irq.h @@ -514,9 +514,12 @@ extern u64 ppc64_interrupt_controller; #endif +#ifndef CONFIG_PPC_MERGE #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) /* pedantic: these are long because they are used with set_bit --RR */ extern unsigned long ppc_cached_irq_mask[NR_MASK_WORDS]; +#endif + extern atomic_t ppc_n_lost_interrupts; #define virt_irq_create_mapping(x) (x) @@ -579,9 +582,8 @@ extern struct thread_info *softirq_ctx[NR_CPUS]; extern void irq_ctx_init(void); extern void call_do_softirq(struct thread_info *tp); -extern int call___do_IRQ(int irq, struct pt_regs *regs, - struct thread_info *tp); - +extern int call_handle_irq(int irq, void *p1, void *p2, + struct thread_info *tp, void *func); #else #define irq_ctx_init() diff --git 
a/include/asm-powerpc/mpic.h b/include/asm-powerpc/mpic.h index f0d22ac34b96..a2277cb77ddc 100644 --- a/include/asm-powerpc/mpic.h +++ b/include/asm-powerpc/mpic.h @@ -114,9 +114,6 @@ #define MPIC_VEC_TIMER_1 248 #define MPIC_VEC_TIMER_0 247 -/* Type definition of the cascade handler */ -typedef int (*mpic_cascade_t)(struct pt_regs *regs, void *data); - #ifdef CONFIG_MPIC_BROKEN_U3 /* Fixup table entry */ struct mpic_irq_fixup @@ -133,9 +130,12 @@ struct mpic_irq_fixup struct mpic { /* The "linux" controller struct */ - hw_irq_controller hc_irq; + struct irq_chip hc_irq; +#ifdef CONFIG_MPIC_BROKEN_U3 + struct irq_chip hc_ht_irq; +#endif #ifdef CONFIG_SMP - hw_irq_controller hc_ipi; + struct irq_chip hc_ipi; #endif const char *name; /* Flags */ @@ -153,10 +153,6 @@ struct mpic unsigned int num_sources; /* Number of CPUs */ unsigned int num_cpus; - /* cascade handler */ - mpic_cascade_t cascade; - void *cascade_data; - unsigned int cascade_vec; /* senses array */ unsigned char *senses; unsigned int senses_count; @@ -237,17 +233,6 @@ extern void mpic_assign_isu(struct mpic *mpic, unsigned int isu_num, */ extern void mpic_init(struct mpic *mpic); -/* Setup a cascade. Currently, only one cascade is supported this - * way, though you can always do a normal request_irq() and add - * other cascades this way. You should call this _after_ having - * added all the ISUs - * - * @irq_no: "linux" irq number of the cascade (that is offset'ed vector) - * @handler: cascade handler function - */ -extern void mpic_setup_cascade(unsigned int irq_no, mpic_cascade_t hanlder, - void *data); - /* * All of the following functions must only be used after the * ISUs have been assigned and the controller fully initialized -- cgit v1.2.3 From cc9fd71c62f542233c412b5fabc1bbe0c4d5ad08 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 3 Jul 2006 19:35:17 +1000 Subject: [POWERPC] New device-tree interrupt parsing code Adds new routines to prom_parse to walk the device-tree for interrupt information. This includes both direct mapping of interrupts and low level parsing functions for use with partial trees. 
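For a feel of how the new routines are meant to be called, here is a rough, hypothetical sketch (not part of the patch; the actual declarations are in the prom.h hunk below). The printout and error handling are illustrative only:

/* Sketch: resolve interrupt 0 of a device node and dump the raw
 * specifier. struct of_irq and of_irq_map_one() are what this patch
 * adds; the controller node comes back with a reference held, so it
 * must be dropped with of_node_put() when done (NULL-safe).
 */
static int example_dump_irq(struct device_node *dev)
{
	struct of_irq oirq;
	unsigned int i;

	if (of_irq_map_one(dev, 0, &oirq))
		return -EINVAL;	/* no usable "interrupts" property */

	printk(KERN_DEBUG "%s: %d-cell specifier:",
	       dev->full_name, oirq.size);
	for (i = 0; i < oirq.size; i++)
		printk(" 0x%x", oirq.specifier[i]);
	printk(", controller %s\n", oirq.controller
	       ? oirq.controller->full_name : "<none, OldWorld>");

	of_node_put(oirq.controller);
	return 0;
}

Turning that raw specifier into something request_irq() can actually use is what the remapper core in the next patch of the series adds.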
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/prom_parse.c | 443 ++++++++++++++++++++++++++++++++++++--- include/asm-powerpc/prom.h | 87 ++++++++ 2 files changed, 506 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index 45df420383cc..21009b1f7869 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -38,14 +38,6 @@ static void of_dump_addr(const char *s, u32 *addr, int na) static void of_dump_addr(const char *s, u32 *addr, int na) { } #endif -/* Read a big address */ -static inline u64 of_read_addr(u32 *cell, int size) -{ - u64 r = 0; - while (size--) - r = (r << 32) | *(cell++); - return r; -} /* Callbacks for bus specific translators */ struct of_bus { @@ -77,9 +69,9 @@ static u64 of_bus_default_map(u32 *addr, u32 *range, int na, int ns, int pna) { u64 cp, s, da; - cp = of_read_addr(range, na); - s = of_read_addr(range + na + pna, ns); - da = of_read_addr(addr, na); + cp = of_read_number(range, na); + s = of_read_number(range + na + pna, ns); + da = of_read_number(addr, na); DBG("OF: default map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); @@ -91,7 +83,7 @@ static u64 of_bus_default_map(u32 *addr, u32 *range, int na, int ns, int pna) static int of_bus_default_translate(u32 *addr, u64 offset, int na) { - u64 a = of_read_addr(addr, na); + u64 a = of_read_number(addr, na); memset(addr, 0, na * 4); a += offset; if (na > 1) @@ -135,9 +127,9 @@ static u64 of_bus_pci_map(u32 *addr, u32 *range, int na, int ns, int pna) return OF_BAD_ADDR; /* Read address values, skipping high cell */ - cp = of_read_addr(range + 1, na - 1); - s = of_read_addr(range + na + pna, ns); - da = of_read_addr(addr + 1, na - 1); + cp = of_read_number(range + 1, na - 1); + s = of_read_number(range + na + pna, ns); + da = of_read_number(addr + 1, na - 1); DBG("OF: PCI map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); @@ -195,9 +187,9 @@ static u64 of_bus_isa_map(u32 *addr, u32 *range, int na, int ns, int pna) return OF_BAD_ADDR; /* Read address values, skipping high cell */ - cp = of_read_addr(range + 1, na - 1); - s = of_read_addr(range + na + pna, ns); - da = of_read_addr(addr + 1, na - 1); + cp = of_read_number(range + 1, na - 1); + s = of_read_number(range + na + pna, ns); + da = of_read_number(addr + 1, na - 1); DBG("OF: ISA map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); @@ -295,7 +287,7 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus, */ ranges = (u32 *)get_property(parent, "ranges", &rlen); if (ranges == NULL || rlen == 0) { - offset = of_read_addr(addr, na); + offset = of_read_number(addr, na); memset(addr, 0, pna * 4); DBG("OF: no ranges, 1:1 translation\n"); goto finish; @@ -378,7 +370,7 @@ u64 of_translate_address(struct device_node *dev, u32 *in_addr) /* If root, we have finished */ if (parent == NULL) { DBG("OF: reached root node\n"); - result = of_read_addr(addr, na); + result = of_read_number(addr, na); break; } @@ -442,7 +434,7 @@ u32 *of_get_address(struct device_node *dev, int index, u64 *size, for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) if (i == index) { if (size) - *size = of_read_addr(prop + na, ns); + *size = of_read_number(prop + na, ns); if (flags) *flags = bus->get_flags(prop); return prop; @@ -484,7 +476,7 @@ u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) if 
((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) { if (size) - *size = of_read_addr(prop + na, ns); + *size = of_read_number(prop + na, ns); if (flags) *flags = bus->get_flags(prop); return prop; @@ -565,11 +557,414 @@ void of_parse_dma_window(struct device_node *dn, unsigned char *dma_window_prop, prop = get_property(dn, "#address-cells", NULL); cells = prop ? *(u32 *)prop : prom_n_addr_cells(dn); - *phys = of_read_addr(dma_window, cells); + *phys = of_read_number(dma_window, cells); dma_window += cells; prop = get_property(dn, "ibm,#dma-size-cells", NULL); cells = prop ? *(u32 *)prop : prom_n_size_cells(dn); - *size = of_read_addr(dma_window, cells); + *size = of_read_number(dma_window, cells); +} + +/* + * Interrupt remapper + */ + +static unsigned int of_irq_workarounds; +static struct device_node *of_irq_dflt_pic; + +static struct device_node *of_irq_find_parent(struct device_node *child) +{ + struct device_node *p; + phandle *parp; + + if (!of_node_get(child)) + return NULL; + + do { + parp = (phandle *)get_property(child, "interrupt-parent", NULL); + if (parp == NULL) + p = of_get_parent(child); + else { + if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) + p = of_node_get(of_irq_dflt_pic); + else + p = of_find_node_by_phandle(*parp); + } + of_node_put(child); + child = p; + } while (p && get_property(p, "#interrupt-cells", NULL) == NULL); + + return p; +} + +static u8 of_irq_pci_swizzle(u8 slot, u8 pin) +{ + return (((pin - 1) + slot) % 4) + 1; } + +/* This doesn't need to be called if you don't have any special workaround + * flags to pass + */ +void of_irq_map_init(unsigned int flags) +{ + of_irq_workarounds = flags; + + /* OldWorld, don't bother looking at other things */ + if (flags & OF_IMAP_OLDWORLD_MAC) + return; + + /* If we don't have phandles, let's try to locate a default interrupt + * controller (happens when booting with BootX). We do a first match + * here, hopefully, that only ever happens on machines with one + * controller. + */ + if (flags & OF_IMAP_NO_PHANDLE) { + struct device_node *np; + + for(np = NULL; (np = of_find_all_nodes(np)) != NULL;) { + if (get_property(np, "interrupt-controller", NULL) + == NULL) + continue; + /* Skip /chosen/interrupt-controller */ + if (strcmp(np->name, "chosen") == 0) + continue; + /* It seems like at least one person on this planet wants + * to use BootX on a machine with an AppleKiwi controller + * which happens to pretend to be an interrupt + * controller too. + */ + if (strcmp(np->name, "AppleKiwi") == 0) + continue; + /* I think we found one ! */ + of_irq_dflt_pic = np; + break; + } + } + +} + +int of_irq_map_raw(struct device_node *parent, u32 *intspec, u32 *addr, + struct of_irq *out_irq) +{ + struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL; + u32 *tmp, *imap, *imask; + u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; + int imaplen, match, i; + + ipar = of_node_get(parent); + + /* First get the #interrupt-cells property of the current cursor + * that tells us how to interpret the passed-in intspec. If there + * is none, we are nice and just walk up the tree + */ + do { + tmp = (u32 *)get_property(ipar, "#interrupt-cells", NULL); + if (tmp != NULL) { + intsize = *tmp; + break; + } + tnode = ipar; + ipar = of_irq_find_parent(ipar); + of_node_put(tnode); + } while (ipar); + if (ipar == NULL) { + DBG(" -> no parent found !\n"); + goto fail; + } + + DBG("of_irq_map_raw: ipar=%s, size=%d\n", ipar->full_name, intsize); + + /* Look for this #address-cells. 
We have to implement the old linux + * trick of looking for the parent here as some device-trees rely on it + */ + old = of_node_get(ipar); + do { + tmp = (u32 *)get_property(old, "#address-cells", NULL); + tnode = of_get_parent(old); + of_node_put(old); + old = tnode; + } while(old && tmp == NULL); + of_node_put(old); + old = NULL; + addrsize = (tmp == NULL) ? 2 : *tmp; + + DBG(" -> addrsize=%d\n", addrsize); + + /* Now start the actual "proper" walk of the interrupt tree */ + while (ipar != NULL) { + /* Now check if cursor is an interrupt-controller and if it is + * then we are done + */ + if (get_property(ipar, "interrupt-controller", NULL) != NULL) { + DBG(" -> got it !\n"); + memcpy(out_irq->specifier, intspec, + intsize * sizeof(u32)); + out_irq->size = intsize; + out_irq->controller = ipar; + of_node_put(old); + return 0; + } + + /* Now look for an interrupt-map */ + imap = (u32 *)get_property(ipar, "interrupt-map", &imaplen); + /* No interrupt map, check for an interrupt parent */ + if (imap == NULL) { + DBG(" -> no map, getting parent\n"); + newpar = of_irq_find_parent(ipar); + goto skiplevel; + } + imaplen /= sizeof(u32); + + /* Look for a mask */ + imask = (u32 *)get_property(ipar, "interrupt-map-mask", NULL); + + /* If we were passed no "reg" property and we attempt to parse + * an interrupt-map, then #address-cells must be 0. + * Fail if it's not. + */ + if (addr == NULL && addrsize != 0) { + DBG(" -> no reg passed in when needed !\n"); + goto fail; + } + + /* Parse interrupt-map */ + match = 0; + while (imaplen > (addrsize + intsize + 1) && !match) { + /* Compare specifiers */ + match = 1; + for (i = 0; i < addrsize && match; ++i) { + u32 mask = imask ? imask[i] : 0xffffffffu; + match = ((addr[i] ^ imap[i]) & mask) == 0; + } + for (; i < (addrsize + intsize) && match; ++i) { + u32 mask = imask ? imask[i] : 0xffffffffu; + match = + ((intspec[i-addrsize] ^ imap[i]) & mask) == 0; + } + imap += addrsize + intsize; + imaplen -= addrsize + intsize; + + DBG(" -> match=%d (imaplen=%d)\n", match, imaplen); + + /* Get the interrupt parent */ + if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) + newpar = of_node_get(of_irq_dflt_pic); + else + newpar = of_find_node_by_phandle((phandle)*imap); + imap++; + --imaplen; + + /* Check if not found */ + if (newpar == NULL) { + DBG(" -> imap parent not found !\n"); + goto fail; + } + + /* Get #interrupt-cells and #address-cells of new + * parent + */ + tmp = (u32 *)get_property(newpar, "#interrupt-cells", + NULL); + if (tmp == NULL) { + DBG(" -> parent lacks #interrupt-cells !\n"); + goto fail; + } + newintsize = *tmp; + tmp = (u32 *)get_property(newpar, "#address-cells", + NULL); + newaddrsize = (tmp == NULL) ? 0 : *tmp; + + DBG(" -> newintsize=%d, newaddrsize=%d\n", + newintsize, newaddrsize); + + /* Check for malformed properties */ + if (imaplen < (newaddrsize + newintsize)) + goto fail; + + imap += newaddrsize + newintsize; + imaplen -= newaddrsize + newintsize; + + DBG(" -> imaplen=%d\n", imaplen); + } + if (!match) + goto fail; + + of_node_put(old); + old = of_node_get(newpar); + addrsize = newaddrsize; + intsize = newintsize; + intspec = imap - intsize; + addr = intspec - addrsize; + + skiplevel: + /* Iterate again with new parent */ + DBG(" -> new parent: %s\n", newpar ? 
newpar->full_name : "<>"); + of_node_put(ipar); + ipar = newpar; + newpar = NULL; + } + fail: + of_node_put(ipar); + of_node_put(old); + of_node_put(newpar); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(of_irq_map_raw); + +#if defined(CONFIG_PPC_PMAC) && defined(CONFIG_PPC32) +static int of_irq_map_oldworld(struct device_node *device, int index, + struct of_irq *out_irq) +{ + u32 *ints; + int intlen; + + /* + * Old machines just have a list of interrupt numbers + * and no interrupt-controller nodes. + */ + ints = (u32 *) get_property(device, "AAPL,interrupts", &intlen); + if (ints == NULL) + return -EINVAL; + intlen /= sizeof(u32); + + if (index >= intlen) + return -EINVAL; + + out_irq->controller = NULL; + out_irq->specifier[0] = ints[index]; + out_irq->size = 1; + + return 0; +} +#else /* defined(CONFIG_PPC_PMAC) && defined(CONFIG_PPC32) */ +static int of_irq_map_oldworld(struct device_node *device, int index, + struct of_irq *out_irq) +{ + return -EINVAL; +} +#endif /* !(defined(CONFIG_PPC_PMAC) && defined(CONFIG_PPC32)) */ + +int of_irq_map_one(struct device_node *device, int index, struct of_irq *out_irq) +{ + struct device_node *p; + u32 *intspec, *tmp, intsize, intlen, *addr; + int res; + + DBG("of_irq_map_one: dev=%s, index=%d\n", device->full_name, index); + + /* OldWorld mac stuff is "special", handle out of line */ + if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC) + return of_irq_map_oldworld(device, index, out_irq); + + /* Get the interrupts property */ + intspec = (u32 *)get_property(device, "interrupts", &intlen); + if (intspec == NULL) + return -EINVAL; + intlen /= sizeof(u32); + + /* Get the reg property (if any) */ + addr = (u32 *)get_property(device, "reg", NULL); + + /* Look for the interrupt parent. */ + p = of_irq_find_parent(device); + if (p == NULL) + return -EINVAL; + + /* Get size of interrupt specifier */ + tmp = (u32 *)get_property(p, "#interrupt-cells", NULL); + if (tmp == NULL) { + of_node_put(p); + return -EINVAL; + } + intsize = *tmp; + + /* Check index */ + if (index * intsize >= intlen) + return -EINVAL; + + /* Get new specifier and map it */ + res = of_irq_map_raw(p, intspec + index * intsize, addr, out_irq); + of_node_put(p); + return res; +} +EXPORT_SYMBOL_GPL(of_irq_map_one); + +int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) +{ + struct device_node *dn, *ppnode; + struct pci_dev *ppdev; + u32 lspec; + u32 laddr[3]; + u8 pin; + int rc; + + /* Check if we have a device node, if yes, fallback to standard OF + * parsing + */ + dn = pci_device_to_OF_node(pdev); + if (dn) + return of_irq_map_one(dn, 0, out_irq); + + /* Ok, we don't, time to have fun. Let's start by building up an + * interrupt spec. we assume #interrupt-cells is 1, which is standard + * for PCI. If you do different, then don't use that routine. + */ + rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); + if (rc != 0) + return rc; + /* No pin, exit */ + if (pin == 0) + return -ENODEV; + + /* Now we walk up the PCI tree */ + lspec = pin; + for (;;) { + /* Get the pci_dev of our parent */ + ppdev = pdev->bus->self; + + /* Ouch, it's a host bridge... */ + if (ppdev == NULL) { +#ifdef CONFIG_PPC64 + ppnode = pci_bus_to_OF_node(pdev->bus); +#else + struct pci_controller *host; + host = pci_bus_to_host(pdev->bus); + ppnode = host ? host->arch_data : NULL; +#endif + /* No node for host bridge ? 
give up */ + if (ppnode == NULL) + return -EINVAL; + } else + /* We found a P2P bridge, check if it has a node */ + ppnode = pci_device_to_OF_node(ppdev); + + /* Ok, we have found a parent with a device-node, hand over to + * the OF parsing code. + * We build a unit address from the linux device to be used for + * resolution. Note that we use the linux bus number which may + * not match your firmware bus numbering. + * Fortunately, in most cases, interrupt-map-mask doesn't include + * the bus number as part of the matching. + * You should still be careful about that though if you intend + * to rely on this function (i.e. you ship a firmware that doesn't + * create device nodes for all PCI devices). + */ + if (ppnode) + break; + + /* We can only get here if we hit a P2P bridge with no node, + * let's do standard swizzling and try again + */ + lspec = of_irq_pci_swizzle(PCI_SLOT(pdev->devfn), lspec); + pdev = ppdev; + } + + laddr[0] = (pdev->bus->number << 16) + | (pdev->devfn << 8); + laddr[1] = laddr[2] = 0; + return of_irq_map_raw(ppnode, &lspec, laddr, out_irq); +} +EXPORT_SYMBOL_GPL(of_irq_map_pci); + diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h index b0768f474809..48bef401bc19 100644 --- a/include/asm-powerpc/prom.h +++ b/include/asm-powerpc/prom.h @@ -204,6 +204,15 @@ extern int release_OF_resource(struct device_node* node, int index); */ +/* Helper to read a big number */ +static inline u64 of_read_number(u32 *cell, int size) +{ + u64 r = 0; + while (size--) + r = (r << 32) | *(cell++); + return r; +} + /* Translate an OF address block into a CPU physical address */ #define OF_BAD_ADDR ((u64)-1) @@ -240,5 +249,83 @@ extern void kdump_move_device_tree(void); /* CPU OF node matching */ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); + +/* + * OF interrupt mapping + */ + +/* This structure is returned when an interrupt is mapped. The controller + * field needs to be put() after use + */ + +#define OF_MAX_IRQ_SPEC 4 /* We handle specifiers of at most 4 cells */ + +struct of_irq { + struct device_node *controller; /* Interrupt controller node */ + u32 size; /* Specifier size */ + u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */ +}; + +/*** + * of_irq_map_init - Initialize the irq remapper + * @flags: flags defining workarounds to enable + * + * Some machines have bugs in the device-tree which require certain workarounds + * to be applied. Call this before any interrupt mapping attempts to enable + * those workarounds. + */ +#define OF_IMAP_OLDWORLD_MAC 0x00000001 +#define OF_IMAP_NO_PHANDLE 0x00000002 + +extern void of_irq_map_init(unsigned int flags); + +/*** + * of_irq_map_raw - Low level interrupt tree parsing + * @parent: the device interrupt parent + * @intspec: interrupt specifier ("interrupts" property of the device) + * @addr: address specifier (start of "reg" property of the device) + * @out_irq: structure of_irq filled by this function + * + * Returns 0 on success and a negative number on error + * + * This function is a low-level interrupt tree walking function. It + * can be used to do a partial walk with synthesized reg and interrupts + * properties, for example when resolving PCI interrupts when no device + * node exists for the parent.
+ * + */ + +extern int of_irq_map_raw(struct device_node *parent, u32 *intspec, u32 *addr, + struct of_irq *out_irq); + + +/*** + * of_irq_map_one - Resolve an interrupt for a device + * @device: the device whose interrupt is to be resolved + * @index: index of the interrupt to resolve + * @out_irq: structure of_irq filled by this function + * + * This function resolves an interrupt, walking the tree, for a given + * device-tree node. It's the high level counterpart to of_irq_map_raw(). + * It also implements the workarounds for OldWorld Macs. + */ +extern int of_irq_map_one(struct device_node *device, int index, + struct of_irq *out_irq); + +/*** + * of_irq_map_pci - Resolve the interrupt for a PCI device + * @pdev: the device whose interrupt is to be resolved + * @out_irq: structure of_irq filled by this function + * + * This function resolves the PCI interrupt for a given PCI device. If a + * device-node exists for a given pci_dev, it will use normal OF tree + * walking. If not, it will implement standard swizzling and walk up the + * PCI tree until a device-node is found, at which point it will finish + * resolving using the OF tree walking. + */ +struct pci_dev; +extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); + + #endif /* __KERNEL__ */ #endif /* _POWERPC_PROM_H */ -- cgit v1.2.3 From 0ebfff1491ef85d41ddf9c633834838be144f69f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 3 Jul 2006 21:36:01 +1000 Subject: [POWERPC] Add new interrupt mapping core and change platforms to use it This adds the new irq remapper core and removes the old one. Because there are some fundamental conflicts with the old code, like the value of NO_IRQ which I'm now setting to 0 (as per discussions with Linus), etc..., this commit also changes the relevant platform and driver code over to use the new remapper (so as not to cause difficulties later in bisecting). This patch removes the old pre-parsing of the open firmware interrupt tree along with all the bogus assumptions it made to try to renumber interrupts according to the platform. This is all to be handled by the new code now. For the pSeries XICS interrupt controller, a single remapper host is created for the whole machine regardless of how many interrupt presentation and source controllers are found, and it's set to match any device node that isn't an 8259. That works fine on pSeries and avoids having to deal with some of the complexities of split source controllers vs. presentation controllers in the pSeries device trees. The powerpc i8259 PIC driver now always requests the legacy interrupt range. It also has the feature of being able to match any device node (including NULL) if passed no device node as an input. That will help porting over platforms with broken device-trees like Pegasos which don't have a proper interrupt tree.
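Before the long list of platform conversions below, a minimal hypothetical PIC driver shows the shape of the new remapper API. All mypic_* names and MYPIC_NR_SRCS are invented for illustration; the ops signatures mirror the host->ops->match/map/xlate calls made by the new core in arch/powerpc/kernel/irq.c (the exact struct irq_host_ops definition lives in include/asm-powerpc/irq.h, not quoted here):

#define MYPIC_NR_SRCS	64	/* invented: number of hw sources */

static struct irq_chip mypic_chip;	/* .mask/.unmask/.eoi not shown */
static struct device_node *mypic_node;
static struct irq_host *mypic_host;

static int mypic_host_match(struct irq_host *h, struct device_node *node)
{
	return node == mypic_node;	/* only claim our controller node */
}

static int mypic_host_map(struct irq_host *h, unsigned int virq,
			  irq_hw_number_t hwirq, unsigned int flags)
{
	/* called from irq_create_mapping(): wire chip and flow handler
	 * to the freshly allocated virq
	 */
	set_irq_chip_and_handler(virq, &mypic_chip, handle_level_irq);
	return 0;
}

static int mypic_host_xlate(struct irq_host *h, struct device_node *ct,
			    u32 *intspec, unsigned int intsize,
			    irq_hw_number_t *out_hwirq, unsigned int *out_flags)
{
	*out_hwirq = intspec[0];	/* one-cell specifier: just the line */
	*out_flags = IRQ_TYPE_NONE;
	return 0;
}

static struct irq_host_ops mypic_host_ops = {
	.match	= mypic_host_match,
	.map	= mypic_host_map,
	.xlate	= mypic_host_xlate,
};

static void __init mypic_init(struct device_node *node)
{
	mypic_node = of_node_get(node);
	/* one linear host, MYPIC_NR_SRCS reverse-map slots, hw irq 0
	 * used as the invalid marker
	 */
	mypic_host = irq_alloc_host(IRQ_HOST_MAP_LINEAR, MYPIC_NR_SRCS,
				    &mypic_host_ops, 0);
}

Note that the core treats .unmap as optional (irq_dispose_mapping() below only calls it when non-NULL), and a NULL .match means "match everything", which is how the i8259 fallback described above works.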
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/ibmebus.c | 9 +- arch/powerpc/kernel/irq.c | 629 +++++++++++++++++++++------ arch/powerpc/kernel/legacy_serial.c | 46 +- arch/powerpc/kernel/pci_32.c | 37 ++ arch/powerpc/kernel/pci_64.c | 33 +- arch/powerpc/kernel/prom.c | 451 +------------------ arch/powerpc/kernel/rtas_pci.c | 17 - arch/powerpc/kernel/setup_32.c | 1 - arch/powerpc/kernel/setup_64.c | 17 +- arch/powerpc/kernel/vio.c | 12 +- arch/powerpc/platforms/cell/interrupt.c | 396 ++++++++--------- arch/powerpc/platforms/cell/interrupt.h | 17 +- arch/powerpc/platforms/cell/setup.c | 13 +- arch/powerpc/platforms/cell/spider-pic.c | 341 +++++++++++---- arch/powerpc/platforms/cell/spu_base.c | 119 +++-- arch/powerpc/platforms/chrp/pci.c | 11 +- arch/powerpc/platforms/chrp/setup.c | 110 +++-- arch/powerpc/platforms/chrp/smp.c | 1 - arch/powerpc/platforms/iseries/irq.c | 92 ++-- arch/powerpc/platforms/iseries/irq.h | 2 +- arch/powerpc/platforms/iseries/setup.c | 8 - arch/powerpc/platforms/maple/pci.c | 17 +- arch/powerpc/platforms/maple/setup.c | 88 ++-- arch/powerpc/platforms/powermac/bootx_init.c | 4 +- arch/powerpc/platforms/powermac/low_i2c.c | 9 +- arch/powerpc/platforms/powermac/nvram.c | 5 +- arch/powerpc/platforms/powermac/pci.c | 68 ++- arch/powerpc/platforms/powermac/pfunc_base.c | 13 +- arch/powerpc/platforms/powermac/pic.c | 328 ++++++-------- arch/powerpc/platforms/powermac/pmac.h | 2 + arch/powerpc/platforms/powermac/setup.c | 3 - arch/powerpc/platforms/pseries/ras.c | 82 ++-- arch/powerpc/platforms/pseries/setup.c | 246 ++++++----- arch/powerpc/platforms/pseries/smp.c | 32 +- arch/powerpc/platforms/pseries/xics.c | 485 +++++++++++++-------- arch/powerpc/platforms/pseries/xics.h | 2 +- arch/powerpc/sysdev/i8259.c | 141 ++++-- arch/powerpc/sysdev/mpic.c | 333 +++++++++----- drivers/char/hvsi.c | 7 +- drivers/macintosh/macio-adb.c | 19 +- drivers/macintosh/macio_asic.c | 152 ++++--- drivers/macintosh/smu.c | 6 +- drivers/macintosh/via-cuda.c | 24 +- drivers/macintosh/via-pmu.c | 33 +- drivers/net/mace.c | 4 +- drivers/serial/pmac_zilog.c | 6 +- include/asm-powerpc/i8259.h | 5 + include/asm-powerpc/irq.h | 356 ++++++++++++--- include/asm-powerpc/machdep.h | 2 +- include/asm-powerpc/mpic.h | 42 +- include/asm-powerpc/prom.h | 7 - include/asm-powerpc/spu.h | 1 + sound/aoa/core/snd-aoa-gpio-feature.c | 7 +- sound/aoa/soundbus/i2sbus/i2sbus-core.c | 7 +- sound/oss/dmasound/dmasound_awacs.c | 16 +- sound/ppc/pmac.c | 33 +- sound/ppc/tumbler.c | 8 +- 57 files changed, 2829 insertions(+), 2126 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index e47d40ac6f39..97ddc02a3d42 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -323,13 +323,11 @@ int ibmebus_request_irq(struct ibmebus_dev *dev, unsigned long irq_flags, const char * devname, void *dev_id) { - unsigned int irq = virt_irq_create_mapping(ist); + unsigned int irq = irq_create_mapping(NULL, ist, 0); if (irq == NO_IRQ) return -EINVAL; - irq = irq_offset_up(irq); - return request_irq(irq, handler, irq_flags, devname, dev_id); } @@ -337,12 +335,9 @@ EXPORT_SYMBOL(ibmebus_request_irq); void ibmebus_free_irq(struct ibmebus_dev *dev, u32 ist, void *dev_id) { - unsigned int irq = virt_irq_create_mapping(ist); + unsigned int irq = irq_find_mapping(NULL, ist); - irq = irq_offset_up(irq); free_irq(irq, dev_id); - - return; } EXPORT_SYMBOL(ibmebus_free_irq); diff --git a/arch/powerpc/kernel/irq.c 
b/arch/powerpc/kernel/irq.c index 91248559099a..05a700940f67 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -29,6 +29,8 @@ * to reduce code space and undefined function references. */ +#undef DEBUG + #include #include #include @@ -46,7 +48,10 @@ #include #include #include -#include +#include +#include +#include +#include #include #include @@ -57,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_ISERIES #include #endif @@ -88,7 +94,6 @@ extern atomic_t ipi_sent; EXPORT_SYMBOL(irq_desc); int distribute_irqs = 1; -u64 ppc64_interrupt_controller; #endif /* CONFIG_PPC64 */ int show_interrupts(struct seq_file *p, void *v) @@ -181,7 +186,7 @@ void fixup_irqs(cpumask_t map) void do_IRQ(struct pt_regs *regs) { - int irq; + unsigned int irq; #ifdef CONFIG_IRQSTACKS struct thread_info *curtp, *irqtp; #endif @@ -212,7 +217,7 @@ void do_IRQ(struct pt_regs *regs) */ irq = ppc_md.get_irq(regs); - if (irq >= 0) { + if (irq != NO_IRQ && irq != NO_IRQ_IGNORE) { #ifdef CONFIG_IRQSTACKS /* Switch to the irq stack to handle this */ curtp = current_thread_info(); @@ -231,7 +236,7 @@ void do_IRQ(struct pt_regs *regs) } else #endif generic_handle_irq(irq, regs); - } else if (irq != -2) + } else if (irq != NO_IRQ_IGNORE) /* That's not SMP safe ... but who cares ? */ ppc_spurious_interrupts++; @@ -254,123 +259,6 @@ void __init init_IRQ(void) #endif } -#ifdef CONFIG_PPC64 -/* - * Virtual IRQ mapping code, used on systems with XICS interrupt controllers. - */ - -#define UNDEFINED_IRQ 0xffffffff -unsigned int virt_irq_to_real_map[NR_IRQS]; - -/* - * Don't use virtual irqs 0, 1, 2 for devices. - * The pcnet32 driver considers interrupt numbers < 2 to be invalid, - * and 2 is the XICS IPI interrupt. - * We limit virtual irqs to __irq_offet_value less than virt_irq_max so - * that when we offset them we don't end up with an interrupt - * number >= virt_irq_max. - */ -#define MIN_VIRT_IRQ 3 - -unsigned int virt_irq_max; -static unsigned int max_virt_irq; -static unsigned int nr_virt_irqs; - -void -virt_irq_init(void) -{ - int i; - - if ((virt_irq_max == 0) || (virt_irq_max > (NR_IRQS - 1))) - virt_irq_max = NR_IRQS - 1; - max_virt_irq = virt_irq_max - __irq_offset_value; - nr_virt_irqs = max_virt_irq - MIN_VIRT_IRQ + 1; - - for (i = 0; i < NR_IRQS; i++) - virt_irq_to_real_map[i] = UNDEFINED_IRQ; -} - -/* Create a mapping for a real_irq if it doesn't already exist. - * Return the virtual irq as a convenience. 
- */ -int virt_irq_create_mapping(unsigned int real_irq) -{ - unsigned int virq, first_virq; - static int warned; - - if (ppc64_interrupt_controller == IC_OPEN_PIC) - return real_irq; /* no mapping for openpic (for now) */ - - if (ppc64_interrupt_controller == IC_CELL_PIC) - return real_irq; /* no mapping for iic either */ - - /* don't map interrupts < MIN_VIRT_IRQ */ - if (real_irq < MIN_VIRT_IRQ) { - virt_irq_to_real_map[real_irq] = real_irq; - return real_irq; - } - - /* map to a number between MIN_VIRT_IRQ and max_virt_irq */ - virq = real_irq; - if (virq > max_virt_irq) - virq = (virq % nr_virt_irqs) + MIN_VIRT_IRQ; - - /* search for this number or a free slot */ - first_virq = virq; - while (virt_irq_to_real_map[virq] != UNDEFINED_IRQ) { - if (virt_irq_to_real_map[virq] == real_irq) - return virq; - if (++virq > max_virt_irq) - virq = MIN_VIRT_IRQ; - if (virq == first_virq) - goto nospace; /* oops, no free slots */ - } - - virt_irq_to_real_map[virq] = real_irq; - return virq; - - nospace: - if (!warned) { - printk(KERN_CRIT "Interrupt table is full\n"); - printk(KERN_CRIT "Increase virt_irq_max (currently %d) " - "in your kernel sources and rebuild.\n", virt_irq_max); - warned = 1; - } - return NO_IRQ; -} - -/* - * In most cases will get a hit on the very first slot checked in the - * virt_irq_to_real_map. Only when there are a large number of - * IRQs will this be expensive. - */ -unsigned int real_irq_to_virt_slowpath(unsigned int real_irq) -{ - unsigned int virq; - unsigned int first_virq; - - virq = real_irq; - - if (virq > max_virt_irq) - virq = (virq % nr_virt_irqs) + MIN_VIRT_IRQ; - - first_virq = virq; - - do { - if (virt_irq_to_real_map[virq] == real_irq) - return virq; - - virq++; - - if (virq >= max_virt_irq) - virq = 0; - - } while (first_virq != virq); - - return NO_IRQ; - -} -#endif /* CONFIG_PPC64 */ #ifdef CONFIG_IRQSTACKS struct thread_info *softirq_ctx[NR_CPUS] __read_mostly; @@ -430,6 +318,503 @@ void do_softirq(void) } EXPORT_SYMBOL(do_softirq); + +/* + * IRQ controller and virtual interrupts + */ + +#ifdef CONFIG_PPC_MERGE + +static LIST_HEAD(irq_hosts); +static spinlock_t irq_big_lock = SPIN_LOCK_UNLOCKED; + +struct irq_map_entry irq_map[NR_IRQS]; +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_host *irq_default_host; + +struct irq_host *irq_alloc_host(unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_host_ops *ops, + irq_hw_number_t inval_irq) +{ + struct irq_host *host; + unsigned int size = sizeof(struct irq_host); + unsigned int i; + unsigned int *rmap; + unsigned long flags; + + /* Allocate structure and revmap table if using linear mapping */ + if (revmap_type == IRQ_HOST_MAP_LINEAR) + size += revmap_arg * sizeof(unsigned int); + if (mem_init_done) + host = kzalloc(size, GFP_KERNEL); + else { + host = alloc_bootmem(size); + if (host) + memset(host, 0, size); + } + if (host == NULL) + return NULL; + + /* Fill structure */ + host->revmap_type = revmap_type; + host->inval_irq = inval_irq; + host->ops = ops; + + spin_lock_irqsave(&irq_big_lock, flags); + + /* If it's a legacy controller, check for duplicates and + * mark it as allocated (we use irq 0 host pointer for that + */ + if (revmap_type == IRQ_HOST_MAP_LEGACY) { + if (irq_map[0].host != NULL) { + spin_unlock_irqrestore(&irq_big_lock, flags); + /* If we are early boot, we can't free the structure, + * too bad... 
+ * this will be fixed once slab is made available early + * instead of the current cruft + */ + if (mem_init_done) + kfree(host); + return NULL; + } + irq_map[0].host = host; + } + + list_add(&host->link, &irq_hosts); + spin_unlock_irqrestore(&irq_big_lock, flags); + + /* Additional setups per revmap type */ + switch(revmap_type) { + case IRQ_HOST_MAP_LEGACY: + /* 0 is always the invalid number for legacy */ + host->inval_irq = 0; + /* set us up as the host for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + irq_map[i].hwirq = 0; + smp_wmb(); + irq_map[i].host = host; + smp_wmb(); + + /* Clear some flags */ + get_irq_desc(i)->status + &= ~(IRQ_NOREQUEST | IRQ_LEVEL); + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(host, i, i, 0); + } + break; + case IRQ_HOST_MAP_LINEAR: + rmap = (unsigned int *)(host + 1); + for (i = 0; i < revmap_arg; i++) + rmap[i] = IRQ_NONE; + host->revmap_data.linear.size = revmap_arg; + smp_wmb(); + host->revmap_data.linear.revmap = rmap; + break; + default: + break; + } + + pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + + return host; +} + +struct irq_host *irq_find_host(struct device_node *node) +{ + struct irq_host *h, *found = NULL; + unsigned long flags; + + /* We might want to match the legacy controller last since + * it might potentially be set to match all interrupts in + * the absence of a device node. This isn't a problem so far + * yet though... + */ + spin_lock_irqsave(&irq_big_lock, flags); + list_for_each_entry(h, &irq_hosts, link) + if (h->ops->match == NULL || h->ops->match(h, node)) { + found = h; + break; + } + spin_unlock_irqrestore(&irq_big_lock, flags); + return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +void irq_set_default_host(struct irq_host *host) +{ + pr_debug("irq: Default host set to @0x%p\n", host); + + irq_default_host = host; +} + +void irq_set_virq_count(unsigned int count) +{ + pr_debug("irq: Trying to set virq count to %d\n", count); + + BUG_ON(count < NUM_ISA_INTERRUPTS); + if (count < NR_IRQS) + irq_virq_count = count; +} + +unsigned int irq_create_mapping(struct irq_host *host, + irq_hw_number_t hwirq, + unsigned int flags) +{ + unsigned int virq, hint; + + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx, 0x%x)\n", + host, hwirq, flags); + + /* Look for default host if necessary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) { + printk(KERN_WARNING "irq_create_mapping called for" + " NULL host, hwirq=%lx\n", hwirq); + WARN_ON(1); + return NO_IRQ; + } + pr_debug("irq: -> using host @%p\n", host); + + /* Check if mapping already exists; if it does, call + * host->ops->map() to update the flags + */ + virq = irq_find_mapping(host, hwirq); + if (virq != IRQ_NONE) { + pr_debug("irq: -> existing mapping on virq %d\n", virq); + host->ops->map(host, virq, hwirq, flags); + return virq; + } + + /* Get a virtual interrupt number */ + if (host->revmap_type == IRQ_HOST_MAP_LEGACY) { + /* Handle legacy */ + virq = (unsigned int)hwirq; + if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) + return NO_IRQ; + return virq; + } else { + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + virq = irq_alloc_virt(host, 1, hint); + if (virq == NO_IRQ) { + pr_debug("irq: -> virq allocation failed\n"); + return NO_IRQ; + } + } + pr_debug("irq: -> obtained virq %d\n", virq); + + /* Clear some flags */ + get_irq_desc(virq)->status &= ~(IRQ_NOREQUEST | IRQ_LEVEL); + + /* 
map it */ + if (host->ops->map(host, virq, hwirq, flags)) { + pr_debug("irq: -> mapping failed, freeing\n"); + irq_free_virt(virq, 1); + return NO_IRQ; + } + smp_wmb(); + irq_map[virq].hwirq = hwirq; + smp_mb(); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); + +extern unsigned int irq_create_of_mapping(struct device_node *controller, + u32 *intspec, unsigned int intsize) +{ + struct irq_host *host; + irq_hw_number_t hwirq; + unsigned int flags = IRQ_TYPE_NONE; + + if (controller == NULL) + host = irq_default_host; + else + host = irq_find_host(controller); + if (host == NULL) + return NO_IRQ; + + /* If host has no translation, then we assume interrupt line */ + if (host->ops->xlate == NULL) + hwirq = intspec[0]; + else { + if (host->ops->xlate(host, controller, intspec, intsize, + &hwirq, &flags)) + return NO_IRQ; + } + + return irq_create_mapping(host, hwirq, flags); +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +unsigned int irq_of_parse_and_map(struct device_node *dev, int index) +{ + struct of_irq oirq; + + if (of_irq_map_one(dev, index, &oirq)) + return NO_IRQ; + + return irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); +} +EXPORT_SYMBOL_GPL(irq_of_parse_and_map); + +void irq_dispose_mapping(unsigned int virq) +{ + struct irq_host *host = irq_map[virq].host; + irq_hw_number_t hwirq; + unsigned long flags; + + WARN_ON (host == NULL); + if (host == NULL) + return; + + /* Never unmap legacy interrupts */ + if (host->revmap_type == IRQ_HOST_MAP_LEGACY) + return; + + /* remove chip and handler */ + set_irq_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); + + /* Tell the PIC about it */ + if (host->ops->unmap) + host->ops->unmap(host, virq); + smp_mb(); + + /* Clear reverse map */ + hwirq = irq_map[virq].hwirq; + switch(host->revmap_type) { + case IRQ_HOST_MAP_LINEAR: + if (hwirq < host->revmap_data.linear.size) + host->revmap_data.linear.revmap[hwirq] = IRQ_NONE; + break; + case IRQ_HOST_MAP_TREE: + /* Check if radix tree allocated yet */ + if (host->revmap_data.tree.gfp_mask == 0) + break; + /* XXX radix tree not safe ! 
remove lock when it becomes safe + * and use some RCU sync to make sure everything is ok before we + * can re-use that map entry + */ + spin_lock_irqsave(&irq_big_lock, flags); + radix_tree_delete(&host->revmap_data.tree, hwirq); + spin_unlock_irqrestore(&irq_big_lock, flags); + break; + } + + /* Destroy map */ + smp_mb(); + irq_map[virq].hwirq = host->inval_irq; + + /* Set some flags */ + get_irq_desc(virq)->status |= IRQ_NOREQUEST; + + /* Free it */ + irq_free_virt(virq, 1); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +unsigned int irq_find_mapping(struct irq_host *host, + irq_hw_number_t hwirq) +{ + unsigned int i; + unsigned int hint = hwirq % irq_virq_count; + + /* Look for default host if necessary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) + return NO_IRQ; + + /* legacy -> bail early */ + if (host->revmap_type == IRQ_HOST_MAP_LEGACY) + return hwirq; + + /* Slow path does a linear search of the map */ + if (hint < NUM_ISA_INTERRUPTS) + hint = NUM_ISA_INTERRUPTS; + i = hint; + do { + if (irq_map[i].host == host && + irq_map[i].hwirq == hwirq) + return i; + i++; + if (i >= irq_virq_count) + i = NUM_ISA_INTERRUPTS; + } while(i != hint); + return NO_IRQ; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + + +unsigned int irq_radix_revmap(struct irq_host *host, + irq_hw_number_t hwirq) +{ + struct radix_tree_root *tree; + struct irq_map_entry *ptr; + unsigned int virq; + unsigned long flags; + + WARN_ON(host->revmap_type != IRQ_HOST_MAP_TREE); + + /* Check if the radix tree exists yet. We test the value of + * the gfp_mask for that. Sneaky but saves another int in the + * structure. If not, we fall back to slow mode + */ + tree = &host->revmap_data.tree; + if (tree->gfp_mask == 0) + return irq_find_mapping(host, hwirq); + + /* XXX Current radix trees are NOT SMP safe !!!
Remove that lock + * when that is fixed (when Nick's patch gets in + */ + spin_lock_irqsave(&irq_big_lock, flags); + + /* Now try to resolve */ + ptr = radix_tree_lookup(tree, hwirq); + /* Found it, return */ + if (ptr) { + virq = ptr - irq_map; + goto bail; + } + + /* If not there, try to insert it */ + virq = irq_find_mapping(host, hwirq); + if (virq != NO_IRQ) + radix_tree_insert(tree, virq, &irq_map[virq]); + bail: + spin_unlock_irqrestore(&irq_big_lock, flags); + return virq; +} + +unsigned int irq_linear_revmap(struct irq_host *host, + irq_hw_number_t hwirq) +{ + unsigned int *revmap; + + WARN_ON(host->revmap_type != IRQ_HOST_MAP_LINEAR); + + /* Check revmap bounds */ + if (unlikely(hwirq >= host->revmap_data.linear.size)) + return irq_find_mapping(host, hwirq); + + /* Check if revmap was allocated */ + revmap = host->revmap_data.linear.revmap; + if (unlikely(revmap == NULL)) + return irq_find_mapping(host, hwirq); + + /* Fill up revmap with slow path if no mapping found */ + if (unlikely(revmap[hwirq] == NO_IRQ)) + revmap[hwirq] = irq_find_mapping(host, hwirq); + + return revmap[hwirq]; +} + +unsigned int irq_alloc_virt(struct irq_host *host, + unsigned int count, + unsigned int hint) +{ + unsigned long flags; + unsigned int i, j, found = NO_IRQ; + unsigned int limit = irq_virq_count - count; + + if (count == 0 || count > (irq_virq_count - NUM_ISA_INTERRUPTS)) + return NO_IRQ; + + spin_lock_irqsave(&irq_big_lock, flags); + + /* Use hint for 1 interrupt if any */ + if (count == 1 && hint >= NUM_ISA_INTERRUPTS && + hint < irq_virq_count && irq_map[hint].host == NULL) { + found = hint; + goto hint_found; + } + + /* Look for count consecutive numbers in the allocatable + * (non-legacy) space + */ + for (i = NUM_ISA_INTERRUPTS; i <= limit; ) { + for (j = i; j < (i + count); j++) + if (irq_map[j].host != NULL) { + i = j + 1; + continue; + } + found = i; + break; + } + if (found == NO_IRQ) { + spin_unlock_irqrestore(&irq_big_lock, flags); + return NO_IRQ; + } + hint_found: + for (i = found; i < (found + count); i++) { + irq_map[i].hwirq = host->inval_irq; + smp_wmb(); + irq_map[i].host = host; + } + spin_unlock_irqrestore(&irq_big_lock, flags); + return found; +} + +void irq_free_virt(unsigned int virq, unsigned int count) +{ + unsigned long flags; + unsigned int i; + + WARN_ON (virq < NUM_ISA_INTERRUPTS); + WARN_ON (count == 0 || (virq + count) > irq_virq_count); + + spin_lock_irqsave(&irq_big_lock, flags); + for (i = virq; i < (virq + count); i++) { + struct irq_host *host; + + if (i < NUM_ISA_INTERRUPTS || + (virq + count) > irq_virq_count) + continue; + + host = irq_map[i].host; + irq_map[i].hwirq = host->inval_irq; + smp_wmb(); + irq_map[i].host = NULL; + } + spin_unlock_irqrestore(&irq_big_lock, flags); +} + +void irq_early_init(void) +{ + unsigned int i; + + for (i = 0; i < NR_IRQS; i++) + get_irq_desc(i)->status |= IRQ_NOREQUEST; +} + +/* We need to create the radix trees late */ +static int irq_late_init(void) +{ + struct irq_host *h; + unsigned long flags; + + spin_lock_irqsave(&irq_big_lock, flags); + list_for_each_entry(h, &irq_hosts, link) { + if (h->revmap_type == IRQ_HOST_MAP_TREE) + INIT_RADIX_TREE(&h->revmap_data.tree, GFP_ATOMIC); + } + spin_unlock_irqrestore(&irq_big_lock, flags); + + return 0; +} +arch_initcall(irq_late_init); + +#endif /* CONFIG_PPC_MERGE */ + #ifdef CONFIG_PCI_MSI int pci_enable_msi(struct pci_dev * pdev) { diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index a55056676ca4..7e98e778b52f 100644 --- 
a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -28,6 +28,7 @@ static struct legacy_serial_info { struct device_node *np; unsigned int speed; unsigned int clock; + int irq_check_parent; phys_addr_t taddr; } legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS]; static unsigned int legacy_serial_count; @@ -36,7 +37,7 @@ static int legacy_serial_console = -1; static int __init add_legacy_port(struct device_node *np, int want_index, int iotype, phys_addr_t base, phys_addr_t taddr, unsigned long irq, - upf_t flags) + upf_t flags, int irq_check_parent) { u32 *clk, *spd, clock = BASE_BAUD * 16; int index; @@ -68,7 +69,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index, if (legacy_serial_infos[index].np != 0) { /* if we still have some room, move it, else override */ if (legacy_serial_count < MAX_LEGACY_SERIAL_PORTS) { - printk(KERN_INFO "Moved legacy port %d -> %d\n", + printk(KERN_DEBUG "Moved legacy port %d -> %d\n", index, legacy_serial_count); legacy_serial_ports[legacy_serial_count] = legacy_serial_ports[index]; @@ -76,7 +77,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_infos[index]; legacy_serial_count++; } else { - printk(KERN_INFO "Replacing legacy port %d\n", index); + printk(KERN_DEBUG "Replacing legacy port %d\n", index); } } @@ -95,10 +96,11 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_infos[index].np = of_node_get(np); legacy_serial_infos[index].clock = clock; legacy_serial_infos[index].speed = spd ? *spd : 0; + legacy_serial_infos[index].irq_check_parent = irq_check_parent; - printk(KERN_INFO "Found legacy serial port %d for %s\n", + printk(KERN_DEBUG "Found legacy serial port %d for %s\n", index, np->full_name); - printk(KERN_INFO " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", + printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", (iotype == UPIO_PORT) ? "port" : "mem", (unsigned long long)base, (unsigned long long)taddr, irq, legacy_serial_ports[index].uartclk, @@ -132,7 +134,7 @@ static int __init add_legacy_soc_port(struct device_node *np, /* Add port, irq will be dealt with later. We passed a translated * IO port value. It will be fixed up later along with the irq */ - return add_legacy_port(np, -1, UPIO_MEM, addr, addr, NO_IRQ, flags); + return add_legacy_port(np, -1, UPIO_MEM, addr, addr, NO_IRQ, flags, 0); } static int __init add_legacy_isa_port(struct device_node *np, @@ -170,7 +172,7 @@ static int __init add_legacy_isa_port(struct device_node *np, /* Add port, irq will be dealt with later */ return add_legacy_port(np, index, UPIO_PORT, reg[1], taddr, - NO_IRQ, UPF_BOOT_AUTOCONF); + NO_IRQ, UPF_BOOT_AUTOCONF, 0); } @@ -242,7 +244,8 @@ static int __init add_legacy_pci_port(struct device_node *np, /* Add port, irq will be dealt with later. We passed a translated * IO port value. 
It will be fixed up later along with the irq */ - return add_legacy_port(np, index, iotype, base, addr, NO_IRQ, UPF_BOOT_AUTOCONF); + return add_legacy_port(np, index, iotype, base, addr, NO_IRQ, + UPF_BOOT_AUTOCONF, np != pci_dev); } #endif @@ -373,27 +376,22 @@ static void __init fixup_port_irq(int index, struct device_node *np, struct plat_serial8250_port *port) { + unsigned int virq; + DBG("fixup_port_irq(%d)\n", index); - /* Check for interrupts in that node */ - if (np->n_intrs > 0) { - port->irq = np->intrs[0].line; - DBG(" port %d (%s), irq=%d\n", - index, np->full_name, port->irq); - return; + virq = irq_of_parse_and_map(np, 0); + if (virq == NO_IRQ && legacy_serial_infos[index].irq_check_parent) { + np = of_get_parent(np); + if (np == NULL) + return; + virq = irq_of_parse_and_map(np, 0); + of_node_put(np); } - - /* Check for interrupts in the parent */ - np = of_get_parent(np); - if (np == NULL) + if (virq == NO_IRQ) return; - if (np->n_intrs > 0) { - port->irq = np->intrs[0].line; - DBG(" port %d (%s), irq=%d\n", - index, np->full_name, port->irq); - } - of_node_put(np); + port->irq = virq; } static void __init fixup_port_pio(int index, diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 1333335c474e..898dae8ab6d9 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -1404,6 +1404,43 @@ pcibios_update_irq(struct pci_dev *dev, int irq) /* XXX FIXME - update OF device tree node interrupt property */ } +#ifdef CONFIG_PPC_MERGE +/* XXX This is a copy of the ppc64 version. This is temporary until we start + * merging the 2 PCI layers + */ +/* + * Reads the interrupt pin to determine if interrupt is use by card. + * If the interrupt is used, then gets the interrupt line from the + * openfirmware and sets it in the pci_dev and pci_config line. + */ +int pci_read_irq_line(struct pci_dev *pci_dev) +{ + struct of_irq oirq; + unsigned int virq; + + DBG("Try to map irq for %s...\n", pci_name(pci_dev)); + + if (of_irq_map_pci(pci_dev, &oirq)) { + DBG(" -> failed !\n"); + return -1; + } + + DBG(" -> got one, spec %d cells (0x%08x...) 
on %s\n", + oirq.size, oirq.specifier[0], oirq.controller->full_name); + + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, oirq.size); + if(virq == NO_IRQ) { + DBG(" -> failed to map !\n"); + return -1; + } + pci_dev->irq = virq; + pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, virq); + + return 0; +} +EXPORT_SYMBOL(pci_read_irq_line); +#endif /* CONFIG_PPC_MERGE */ + int pcibios_enable_device(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index bea8451fb57b..efc0b5559ee0 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -398,12 +398,8 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, } else { dev->hdr_type = PCI_HEADER_TYPE_NORMAL; dev->rom_base_reg = PCI_ROM_ADDRESS; + /* Maybe do a default OF mapping here */ dev->irq = NO_IRQ; - if (node->n_intrs > 0) { - dev->irq = node->intrs[0].line; - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, - dev->irq); - } } pci_parse_of_addrs(node, dev); @@ -1288,23 +1284,26 @@ EXPORT_SYMBOL(pcibios_fixup_bus); */ int pci_read_irq_line(struct pci_dev *pci_dev) { - u8 intpin; - struct device_node *node; - - pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &intpin); - if (intpin == 0) - return 0; + struct of_irq oirq; + unsigned int virq; - node = pci_device_to_OF_node(pci_dev); - if (node == NULL) - return -1; + DBG("Try to map irq for %s...\n", pci_name(pci_dev)); - if (node->n_intrs == 0) + if (of_irq_map_pci(pci_dev, &oirq)) { + DBG(" -> failed !\n"); return -1; + } - pci_dev->irq = node->intrs[0].line; + DBG(" -> got one, spec %d cells (0x%08x...) on %s\n", + oirq.size, oirq.specifier[0], oirq.controller->full_name); - pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq); + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, oirq.size); + if(virq == NO_IRQ) { + DBG(" -> failed to map !\n"); + return -1; + } + pci_dev->irq = virq; + pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, virq); return 0; } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index ef3619c28702..a1787ffb6319 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -86,424 +87,6 @@ static DEFINE_RWLOCK(devtree_lock); /* export that to outside world */ struct device_node *of_chosen; -struct device_node *dflt_interrupt_controller; -int num_interrupt_controllers; - -/* - * Wrapper for allocating memory for various data that needs to be - * attached to device nodes as they are processed at boot or when - * added to the device tree later (e.g. DLPAR). At boot there is - * already a region reserved so we just increment *mem_start by size; - * otherwise we call kmalloc. - */ -static void * prom_alloc(unsigned long size, unsigned long *mem_start) -{ - unsigned long tmp; - - if (!mem_start) - return kmalloc(size, GFP_KERNEL); - - tmp = *mem_start; - *mem_start += size; - return (void *)tmp; -} - -/* - * Find the device_node with a given phandle. - */ -static struct device_node * find_phandle(phandle ph) -{ - struct device_node *np; - - for (np = allnodes; np != 0; np = np->allnext) - if (np->linux_phandle == ph) - return np; - return NULL; -} - -/* - * Find the interrupt parent of a node. 
- */ -static struct device_node * __devinit intr_parent(struct device_node *p) -{ - phandle *parp; - - parp = (phandle *) get_property(p, "interrupt-parent", NULL); - if (parp == NULL) - return p->parent; - p = find_phandle(*parp); - if (p != NULL) - return p; - /* - * On a powermac booted with BootX, we don't get to know the - * phandles for any nodes, so find_phandle will return NULL. - * Fortunately these machines only have one interrupt controller - * so there isn't in fact any ambiguity. -- paulus - */ - if (num_interrupt_controllers == 1) - p = dflt_interrupt_controller; - return p; -} - -/* - * Find out the size of each entry of the interrupts property - * for a node. - */ -int __devinit prom_n_intr_cells(struct device_node *np) -{ - struct device_node *p; - unsigned int *icp; - - for (p = np; (p = intr_parent(p)) != NULL; ) { - icp = (unsigned int *) - get_property(p, "#interrupt-cells", NULL); - if (icp != NULL) - return *icp; - if (get_property(p, "interrupt-controller", NULL) != NULL - || get_property(p, "interrupt-map", NULL) != NULL) { - printk("oops, node %s doesn't have #interrupt-cells\n", - p->full_name); - return 1; - } - } -#ifdef DEBUG_IRQ - printk("prom_n_intr_cells failed for %s\n", np->full_name); -#endif - return 1; -} - -/* - * Map an interrupt from a device up to the platform interrupt - * descriptor. - */ -static int __devinit map_interrupt(unsigned int **irq, struct device_node **ictrler, - struct device_node *np, unsigned int *ints, - int nintrc) -{ - struct device_node *p, *ipar; - unsigned int *imap, *imask, *ip; - int i, imaplen, match; - int newintrc = 0, newaddrc = 0; - unsigned int *reg; - int naddrc; - - reg = (unsigned int *) get_property(np, "reg", NULL); - naddrc = prom_n_addr_cells(np); - p = intr_parent(np); - while (p != NULL) { - if (get_property(p, "interrupt-controller", NULL) != NULL) - /* this node is an interrupt controller, stop here */ - break; - imap = (unsigned int *) - get_property(p, "interrupt-map", &imaplen); - if (imap == NULL) { - p = intr_parent(p); - continue; - } - imask = (unsigned int *) - get_property(p, "interrupt-map-mask", NULL); - if (imask == NULL) { - printk("oops, %s has interrupt-map but no mask\n", - p->full_name); - return 0; - } - imaplen /= sizeof(unsigned int); - match = 0; - ipar = NULL; - while (imaplen > 0 && !match) { - /* check the child-interrupt field */ - match = 1; - for (i = 0; i < naddrc && match; ++i) - match = ((reg[i] ^ imap[i]) & imask[i]) == 0; - for (; i < naddrc + nintrc && match; ++i) - match = ((ints[i-naddrc] ^ imap[i]) & imask[i]) == 0; - imap += naddrc + nintrc; - imaplen -= naddrc + nintrc; - /* grab the interrupt parent */ - ipar = find_phandle((phandle) *imap++); - --imaplen; - if (ipar == NULL && num_interrupt_controllers == 1) - /* cope with BootX not giving us phandles */ - ipar = dflt_interrupt_controller; - if (ipar == NULL) { - printk("oops, no int parent %x in map of %s\n", - imap[-1], p->full_name); - return 0; - } - /* find the parent's # addr and intr cells */ - ip = (unsigned int *) - get_property(ipar, "#interrupt-cells", NULL); - if (ip == NULL) { - printk("oops, no #interrupt-cells on %s\n", - ipar->full_name); - return 0; - } - newintrc = *ip; - ip = (unsigned int *) - get_property(ipar, "#address-cells", NULL); - newaddrc = (ip == NULL)? 
0: *ip; - imap += newaddrc + newintrc; - imaplen -= newaddrc + newintrc; - } - if (imaplen < 0) { - printk("oops, error decoding int-map on %s, len=%d\n", - p->full_name, imaplen); - return 0; - } - if (!match) { -#ifdef DEBUG_IRQ - printk("oops, no match in %s int-map for %s\n", - p->full_name, np->full_name); -#endif - return 0; - } - p = ipar; - naddrc = newaddrc; - nintrc = newintrc; - ints = imap - nintrc; - reg = ints - naddrc; - } - if (p == NULL) { -#ifdef DEBUG_IRQ - printk("hmmm, int tree for %s doesn't have ctrler\n", - np->full_name); -#endif - return 0; - } - *irq = ints; - *ictrler = p; - return nintrc; -} - -static unsigned char map_isa_senses[4] = { - IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE, - IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE, - IRQ_SENSE_EDGE | IRQ_POLARITY_NEGATIVE, - IRQ_SENSE_EDGE | IRQ_POLARITY_POSITIVE -}; - -static unsigned char map_mpic_senses[4] = { - IRQ_SENSE_EDGE | IRQ_POLARITY_POSITIVE, - IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE, - /* 2 seems to be used for the 8259 cascade... */ - IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE, - IRQ_SENSE_EDGE | IRQ_POLARITY_NEGATIVE, -}; - -static int __devinit finish_node_interrupts(struct device_node *np, - unsigned long *mem_start, - int measure_only) -{ - unsigned int *ints; - int intlen, intrcells, intrcount; - int i, j, n, sense; - unsigned int *irq, virq; - struct device_node *ic; - int trace = 0; - - //#define TRACE(fmt...) do { if (trace) { printk(fmt); mdelay(1000); } } while(0) -#define TRACE(fmt...) - - if (!strcmp(np->name, "smu-doorbell")) - trace = 1; - - TRACE("Finishing SMU doorbell ! num_interrupt_controllers = %d\n", - num_interrupt_controllers); - - if (num_interrupt_controllers == 0) { - /* - * Old machines just have a list of interrupt numbers - * and no interrupt-controller nodes. - */ - ints = (unsigned int *) get_property(np, "AAPL,interrupts", - &intlen); - /* XXX old interpret_pci_props looked in parent too */ - /* XXX old interpret_macio_props looked for interrupts - before AAPL,interrupts */ - if (ints == NULL) - ints = (unsigned int *) get_property(np, "interrupts", - &intlen); - if (ints == NULL) - return 0; - - np->n_intrs = intlen / sizeof(unsigned int); - np->intrs = prom_alloc(np->n_intrs * sizeof(np->intrs[0]), - mem_start); - if (!np->intrs) - return -ENOMEM; - if (measure_only) - return 0; - - for (i = 0; i < np->n_intrs; ++i) { - np->intrs[i].line = *ints++; - np->intrs[i].sense = IRQ_SENSE_LEVEL - | IRQ_POLARITY_NEGATIVE; - } - return 0; - } - - ints = (unsigned int *) get_property(np, "interrupts", &intlen); - TRACE("ints=%p, intlen=%d\n", ints, intlen); - if (ints == NULL) - return 0; - intrcells = prom_n_intr_cells(np); - intlen /= intrcells * sizeof(unsigned int); - TRACE("intrcells=%d, new intlen=%d\n", intrcells, intlen); - np->intrs = prom_alloc(intlen * sizeof(*(np->intrs)), mem_start); - if (!np->intrs) - return -ENOMEM; - - if (measure_only) - return 0; - - intrcount = 0; - for (i = 0; i < intlen; ++i, ints += intrcells) { - n = map_interrupt(&irq, &ic, np, ints, intrcells); - TRACE("map, irq=%d, ic=%p, n=%d\n", irq, ic, n); - if (n <= 0) - continue; - - /* don't map IRQ numbers under a cascaded 8259 controller */ - if (ic && device_is_compatible(ic, "chrp,iic")) { - np->intrs[intrcount].line = irq[0]; - sense = (n > 1)? 
(irq[1] & 3): 3; - np->intrs[intrcount].sense = map_isa_senses[sense]; - } else { - virq = virt_irq_create_mapping(irq[0]); - TRACE("virq=%d\n", virq); -#ifdef CONFIG_PPC64 - if (virq == NO_IRQ) { - printk(KERN_CRIT "Could not allocate interrupt" - " number for %s\n", np->full_name); - continue; - } -#endif - np->intrs[intrcount].line = irq_offset_up(virq); - sense = (n > 1)? (irq[1] & 3): 1; - - /* Apple uses bits in there in a different way, let's - * only keep the real sense bit on macs - */ - if (machine_is(powermac)) - sense &= 0x1; - np->intrs[intrcount].sense = map_mpic_senses[sense]; - } - -#ifdef CONFIG_PPC64 - /* We offset irq numbers for the u3 MPIC by 128 in PowerMac */ - if (machine_is(powermac) && ic && ic->parent) { - char *name = get_property(ic->parent, "name", NULL); - if (name && !strcmp(name, "u3")) - np->intrs[intrcount].line += 128; - else if (!(name && (!strcmp(name, "mac-io") || - !strcmp(name, "u4")))) - /* ignore other cascaded controllers, such as - the k2-sata-root */ - break; - } -#endif /* CONFIG_PPC64 */ - if (n > 2) { - printk("hmmm, got %d intr cells for %s:", n, - np->full_name); - for (j = 0; j < n; ++j) - printk(" %d", irq[j]); - printk("\n"); - } - ++intrcount; - } - np->n_intrs = intrcount; - - return 0; -} - -static int __devinit finish_node(struct device_node *np, - unsigned long *mem_start, - int measure_only) -{ - struct device_node *child; - int rc = 0; - - rc = finish_node_interrupts(np, mem_start, measure_only); - if (rc) - goto out; - - for (child = np->child; child != NULL; child = child->sibling) { - rc = finish_node(child, mem_start, measure_only); - if (rc) - goto out; - } -out: - return rc; -} - -static void __init scan_interrupt_controllers(void) -{ - struct device_node *np; - int n = 0; - char *name, *ic; - int iclen; - - for (np = allnodes; np != NULL; np = np->allnext) { - ic = get_property(np, "interrupt-controller", &iclen); - name = get_property(np, "name", NULL); - /* checking iclen makes sure we don't get a false - match on /chosen.interrupt_controller */ - if ((name != NULL - && strcmp(name, "interrupt-controller") == 0) - || (ic != NULL && iclen == 0 - && strcmp(name, "AppleKiwi"))) { - if (n == 0) - dflt_interrupt_controller = np; - ++n; - } - } - num_interrupt_controllers = n; -} - -/** - * finish_device_tree is called once things are running normally - * (i.e. with text and data mapped to the address they were linked at). - * It traverses the device tree and fills in some of the additional, - * fields in each node like {n_}addrs and {n_}intrs, the virt interrupt - * mapping is also initialized at this point. - */ -void __init finish_device_tree(void) -{ - unsigned long start, end, size = 0; - - DBG(" -> finish_device_tree\n"); - -#ifdef CONFIG_PPC64 - /* Initialize virtual IRQ map */ - virt_irq_init(); -#endif - scan_interrupt_controllers(); - - /* - * Finish device-tree (pre-parsing some properties etc...) - * We do this in 2 passes. One with "measure_only" set, which - * will only measure the amount of memory needed, then we can - * allocate that memory, and call finish_node again. However, - * we must be careful as most routines will fail nowadays when - * prom_alloc() returns 0, so we must make sure our first pass - * doesn't start at 0. 
We pre-initialize size to 16 for that - * reason and then remove those additional 16 bytes - */ - size = 16; - finish_node(allnodes, &size, 1); - size -= 16; - - if (0 == size) - end = start = 0; - else - end = start = (unsigned long)__va(lmb_alloc(size, 128)); - - finish_node(allnodes, &end, 0); - BUG_ON(end != start + size); - - DBG(" <- finish_device_tree\n"); -} - static inline char *find_flat_dt_string(u32 offset) { return ((char *)initial_boot_params) + @@ -1388,27 +971,6 @@ prom_n_size_cells(struct device_node* np) } EXPORT_SYMBOL(prom_n_size_cells); -/** - * Work out the sense (active-low level / active-high edge) - * of each interrupt from the device tree. - */ -void __init prom_get_irq_senses(unsigned char *senses, int off, int max) -{ - struct device_node *np; - int i, j; - - /* default to level-triggered */ - memset(senses, IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE, max - off); - - for (np = allnodes; np != 0; np = np->allnext) { - for (j = 0; j < np->n_intrs; j++) { - i = np->intrs[j].line; - if (i >= off && i < max) - senses[i-off] = np->intrs[j].sense; - } - } -} - /** * Construct and return a list of the device_nodes with a given name. */ @@ -1808,7 +1370,6 @@ static void of_node_release(struct kref *kref) node->deadprops = NULL; } } - kfree(node->intrs); kfree(node->full_name); kfree(node->data); kfree(node); @@ -1881,13 +1442,7 @@ void of_detach_node(const struct device_node *np) #ifdef CONFIG_PPC_PSERIES /* * Fix up the uninitialized fields in a new device node: - * name, type, n_addrs, addrs, n_intrs, intrs, and pci-specific fields - * - * A lot of boot-time code is duplicated here, because functions such - * as finish_node_interrupts, interpret_pci_props, etc. cannot use the - * slab allocator. - * - * This should probably be split up into smaller chunks. + * name, type and pci-specific fields */ static int of_finish_dynamic_node(struct device_node *node) @@ -1928,8 +1483,6 @@ static int prom_reconfig_notifier(struct notifier_block *nb, switch (action) { case PSERIES_RECONFIG_ADD: err = of_finish_dynamic_node(node); - if (!err) - finish_node(node, NULL, 0); if (err < 0) { printk(KERN_ERR "finish_node returned %d\n", err); err = NOTIFY_BAD; diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 6eb7e49b394a..cda022657324 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -297,19 +297,9 @@ unsigned long __init find_and_init_phbs(void) struct device_node *node; struct pci_controller *phb; unsigned int index; - unsigned int root_size_cells = 0; - unsigned int *opprop = NULL; struct device_node *root = of_find_node_by_path("/"); - if (ppc64_interrupt_controller == IC_OPEN_PIC) { - opprop = (unsigned int *)get_property(root, - "platform-open-pic", NULL); - } - - root_size_cells = prom_n_size_cells(root); - index = 0; - for (node = of_get_next_child(root, NULL); node != NULL; node = of_get_next_child(root, node)) { @@ -324,13 +314,6 @@ unsigned long __init find_and_init_phbs(void) setup_phb(node, phb); pci_process_bridge_OF_ranges(phb, node, 0); pci_setup_phb_io(phb, index == 0); -#ifdef CONFIG_PPC_PSERIES - /* XXX This code need serious fixing ... 
--BenH */ - if (ppc64_interrupt_controller == IC_OPEN_PIC && pSeries_mpic) { - int addr = root_size_cells * (index + 2) - 1; - mpic_assign_isu(pSeries_mpic, index, opprop[addr]); - } -#endif index++; } diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 0c21de39c161..e0df2ba1ab9f 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -239,7 +239,6 @@ void __init setup_arch(char **cmdline_p) ppc_md.init_early(); find_legacy_serial_ports(); - finish_device_tree(); smp_setup_cpu_maps(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ac7276c40685..fd1785e4c9bb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -361,12 +361,15 @@ void __init setup_system(void) /* * Fill the ppc64_caches & systemcfg structures with informations - * retrieved from the device-tree. Need to be called before - * finish_device_tree() since the later requires some of the - * informations filled up here to properly parse the interrupt tree. + * retrieved from the device-tree. */ initialize_cache_info(); + /* + * Initialize irq remapping subsystem + */ + irq_early_init(); + #ifdef CONFIG_PPC_RTAS /* * Initialize RTAS if available @@ -393,12 +396,6 @@ void __init setup_system(void) */ find_legacy_serial_ports(); - /* - * "Finish" the device-tree, that is do the actual parsing of - * some of the properties like the interrupt map - */ - finish_device_tree(); - /* * Initialize xmon */ @@ -427,8 +424,6 @@ void __init setup_system(void) printk("-----------------------------------------------------\n"); printk("ppc64_pft_size = 0x%lx\n", ppc64_pft_size); - printk("ppc64_interrupt_controller = 0x%ld\n", - ppc64_interrupt_controller); printk("physicalMemorySize = 0x%lx\n", lmb_phys_mem_size()); printk("ppc64_caches.dcache_line_size = 0x%x\n", ppc64_caches.dline_size); diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index cdf5867838a6..fad8580f9081 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -218,7 +218,6 @@ struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) { struct vio_dev *viodev; unsigned int *unit_address; - unsigned int *irq_p; /* we need the 'device_type' property, in order to match with drivers */ if (of_node->type == NULL) { @@ -243,16 +242,7 @@ struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) viodev->dev.platform_data = of_node_get(of_node); - viodev->irq = NO_IRQ; - irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL); - if (irq_p) { - int virq = virt_irq_create_mapping(*irq_p); - if (virq == NO_IRQ) { - printk(KERN_ERR "Unable to allocate interrupt " - "number for %s\n", of_node->full_name); - } else - viodev->irq = irq_offset_up(virq); - } + viodev->irq = irq_of_parse_and_map(of_node, 0); snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address); viodev->name = of_node->name; diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 97936f547f19..9d5da7896892 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -1,6 +1,9 @@ /* * Cell Internal Interrupt Controller * + * Copyright (C) 2006 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * IBM, Corp. 
+ * * (C) Copyright IBM Deutschland Entwicklung GmbH 2005 * * Author: Arnd Bergmann @@ -25,11 +28,13 @@ #include #include #include +#include #include #include #include #include +#include #include "interrupt.h" #include "cbe_regs.h" @@ -39,9 +44,25 @@ struct iic { u8 target_id; u8 eoi_stack[16]; int eoi_ptr; + struct irq_host *host; }; static DEFINE_PER_CPU(struct iic, iic); +#define IIC_NODE_COUNT 2 +static struct irq_host *iic_hosts[IIC_NODE_COUNT]; + +/* Convert between "pending" bits and hw irq number */ +static irq_hw_number_t iic_pending_to_hwnum(struct cbe_iic_pending_bits bits) +{ + unsigned char unit = bits.source & 0xf; + + if (bits.flags & CBE_IIC_IRQ_IPI) + return IIC_IRQ_IPI0 | (bits.prio >> 4); + else if (bits.class <= 3) + return (bits.class << 4) | unit; + else + return IIC_IRQ_INVALID; +} static void iic_mask(unsigned int irq) { @@ -65,197 +86,21 @@ static struct irq_chip iic_chip = { .eoi = iic_eoi, }; -/* XXX All of this has to be reworked completely. We need to assign a real - * interrupt numbers to the external interrupts and remove all the hard coded - * interrupt maps (rely on the device-tree whenever possible). - * - * Basically, my scheme is to define the "pendings" bits to be the HW interrupt - * number (ignoring the data and flags here). That means we can sort-of split - * external sources based on priority, and we can use request_irq() on pretty - * much anything. - * - * For spider or axon, they have their own interrupt space. spider will just have - * local "hardward" interrupts 0...xx * node stride. The node stride is not - * necessary (separate interrupt chips will have separate HW number space), but - * will allow to be compatible with existing device-trees. - * - * All of thise little world will get a standard remapping scheme to map those HW - * numbers into the linux flat irq number space. -*/ -static int iic_external_get_irq(struct cbe_iic_pending_bits pending) -{ - int irq; - unsigned char node, unit; - - node = pending.source >> 4; - unit = pending.source & 0xf; - irq = -1; - - /* - * This mapping is specific to the Cell Broadband - * Engine. We might need to get the numbers - * from the device tree to support future CPUs. - */ - switch (unit) { - case 0x00: - case 0x0b: - /* - * One of these units can be connected - * to an external interrupt controller. - */ - if (pending.class != 2) - break; - /* TODO: We might want to silently ignore cascade interrupts - * when no cascade handler exist yet - */ - irq = IIC_EXT_CASCADE + node * IIC_NODE_STRIDE; - break; - case 0x01 ... 0x04: - case 0x07 ... 
0x0a: - /* - * These units are connected to the SPEs - */ - if (pending.class > 2) - break; - irq = IIC_SPE_OFFSET - + pending.class * IIC_CLASS_STRIDE - + node * IIC_NODE_STRIDE - + unit; - break; - } - if (irq == -1) - printk(KERN_WARNING "Unexpected interrupt class %02x, " - "source %02x, prio %02x, cpu %02x\n", pending.class, - pending.source, pending.prio, smp_processor_id()); - return irq; -} - /* Get an IRQ number from the pending state register of the IIC */ -int iic_get_irq(struct pt_regs *regs) -{ - struct iic *iic; - int irq; - struct cbe_iic_pending_bits pending; - - iic = &__get_cpu_var(iic); - *(unsigned long *) &pending = - in_be64((unsigned long __iomem *) &iic->regs->pending_destr); - iic->eoi_stack[++iic->eoi_ptr] = pending.prio; - BUG_ON(iic->eoi_ptr > 15); - - irq = -1; - if (pending.flags & CBE_IIC_IRQ_VALID) { - if (pending.flags & CBE_IIC_IRQ_IPI) { - irq = IIC_IPI_OFFSET + (pending.prio >> 4); -/* - if (irq > 0x80) - printk(KERN_WARNING "Unexpected IPI prio %02x" - "on CPU %02x\n", pending.prio, - smp_processor_id()); -*/ - } else { - irq = iic_external_get_irq(pending); - } - } - return irq; -} - -/* hardcoded part to be compatible with older firmware */ - -static int __init setup_iic_hardcoded(void) -{ - struct device_node *np; - int nodeid, cpu; - unsigned long regs; - struct iic *iic; - - for_each_possible_cpu(cpu) { - iic = &per_cpu(iic, cpu); - nodeid = cpu/2; - - for (np = of_find_node_by_type(NULL, "cpu"); - np; - np = of_find_node_by_type(np, "cpu")) { - if (nodeid == *(int *)get_property(np, "node-id", NULL)) - break; - } - - if (!np) { - printk(KERN_WARNING "IIC: CPU %d not found\n", cpu); - iic->regs = NULL; - iic->target_id = 0xff; - return -ENODEV; - } - - regs = *(long *)get_property(np, "iic", NULL); - - /* hack until we have decided on the devtree info */ - regs += 0x400; - if (cpu & 1) - regs += 0x20; - - printk(KERN_INFO "IIC for CPU %d at %lx\n", cpu, regs); - iic->regs = ioremap(regs, sizeof(struct cbe_iic_thread_regs)); - iic->target_id = (nodeid << 4) + ((cpu & 1) ? 0xf : 0xe); - iic->eoi_stack[0] = 0xff; - } - - return 0; -} - -static int __init setup_iic(void) +static unsigned int iic_get_irq(struct pt_regs *regs) { - struct device_node *dn; - unsigned long *regs; - char *compatible; - unsigned *np, found = 0; - struct iic *iic = NULL; - - for (dn = NULL; (dn = of_find_node_by_name(dn, "interrupt-controller"));) { - compatible = (char *)get_property(dn, "compatible", NULL); - - if (!compatible) { - printk(KERN_WARNING "no compatible property found !\n"); - continue; - } - - if (strstr(compatible, "IBM,CBEA-Internal-Interrupt-Controller")) - regs = (unsigned long *)get_property(dn,"reg", NULL); - else - continue; - - if (!regs) - printk(KERN_WARNING "IIC: no reg property\n"); - - np = (unsigned int *)get_property(dn, "ibm,interrupt-server-ranges", NULL); - - if (!np) { - printk(KERN_WARNING "IIC: CPU association not found\n"); - iic->regs = NULL; - iic->target_id = 0xff; - return -ENODEV; - } - - iic = &per_cpu(iic, np[0]); - iic->regs = ioremap(regs[0], sizeof(struct cbe_iic_thread_regs)); - iic->target_id = ((np[0] & 2) << 3) + ((np[0] & 1) ? 0xf : 0xe); - iic->eoi_stack[0] = 0xff; - printk("IIC for CPU %d at %lx mapped to %p\n", np[0], regs[0], iic->regs); - - iic = &per_cpu(iic, np[1]); - iic->regs = ioremap(regs[2], sizeof(struct cbe_iic_thread_regs)); - iic->target_id = ((np[1] & 2) << 3) + ((np[1] & 1) ? 
0xf : 0xe); - iic->eoi_stack[0] = 0xff; - - printk("IIC for CPU %d at %lx mapped to %p\n", np[1], regs[2], iic->regs); - - found++; - } - - if (found) - return 0; - else - return -ENODEV; + struct cbe_iic_pending_bits pending; + struct iic *iic; + + iic = &__get_cpu_var(iic); + *(unsigned long *) &pending = + in_be64((unsigned long __iomem *) &iic->regs->pending_destr); + iic->eoi_stack[++iic->eoi_ptr] = pending.prio; + BUG_ON(iic->eoi_ptr > 15); + if (pending.flags & CBE_IIC_IRQ_VALID) + return irq_linear_revmap(iic->host, + iic_pending_to_hwnum(pending)); + return NO_IRQ; } #ifdef CONFIG_SMP @@ -263,12 +108,12 @@ static int __init setup_iic(void) /* Use the highest interrupt priorities for IPI */ static inline int iic_ipi_to_irq(int ipi) { - return IIC_IPI_OFFSET + IIC_NUM_IPIS - 1 - ipi; + return IIC_IRQ_IPI0 + IIC_NUM_IPIS - 1 - ipi; } static inline int iic_irq_to_ipi(int irq) { - return IIC_NUM_IPIS - 1 - (irq - IIC_IPI_OFFSET); + return IIC_NUM_IPIS - 1 - (irq - IIC_IRQ_IPI0); } void iic_setup_cpu(void) @@ -287,22 +132,51 @@ u8 iic_get_target_id(int cpu) } EXPORT_SYMBOL_GPL(iic_get_target_id); +struct irq_host *iic_get_irq_host(int node) +{ + if (node < 0 || node >= IIC_NODE_COUNT) + return NULL; + return iic_hosts[node]; +} +EXPORT_SYMBOL_GPL(iic_get_irq_host); + + static irqreturn_t iic_ipi_action(int irq, void *dev_id, struct pt_regs *regs) { - smp_message_recv(iic_irq_to_ipi(irq), regs); + int ipi = (int)(long)dev_id; + + smp_message_recv(ipi, regs); + return IRQ_HANDLED; } static void iic_request_ipi(int ipi, const char *name) { - int irq; + int node, virq; - irq = iic_ipi_to_irq(ipi); - - /* IPIs are marked IRQF_DISABLED as they must run with irqs - * disabled */ - set_irq_chip_and_handler(irq, &iic_chip, handle_percpu_irq); - request_irq(irq, iic_ipi_action, IRQF_DISABLED, name, NULL); + for (node = 0; node < IIC_NODE_COUNT; node++) { + char *rname; + if (iic_hosts[node] == NULL) + continue; + virq = irq_create_mapping(iic_hosts[node], + iic_ipi_to_irq(ipi), 0); + if (virq == NO_IRQ) { + printk(KERN_ERR + "iic: failed to map IPI %s on node %d\n", + name, node); + continue; + } + rname = kzalloc(strlen(name) + 16, GFP_KERNEL); + if (rname) + sprintf(rname, "%s node %d", name, node); + else + rname = (char *)name; + if (request_irq(virq, iic_ipi_action, IRQF_DISABLED, + rname, (void *)(long)ipi)) + printk(KERN_ERR + "iic: failed to request IPI %s on node %d\n", + name, node); + } } void iic_request_IPIs(void) @@ -313,41 +187,119 @@ void iic_request_IPIs(void) iic_request_ipi(PPC_MSG_DEBUGGER_BREAK, "IPI-debug"); #endif /* CONFIG_DEBUGGER */ } + #endif /* CONFIG_SMP */ -static void __init iic_setup_builtin_handlers(void) + +static int iic_host_match(struct irq_host *h, struct device_node *node) +{ + return h->host_data != NULL && node == h->host_data; +} + +static int iic_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + if (hw < IIC_IRQ_IPI0) + set_irq_chip_and_handler(virq, &iic_chip, handle_fasteoi_irq); + else + set_irq_chip_and_handler(virq, &iic_chip, handle_percpu_irq); + return 0; +} + +static int iic_host_xlate(struct irq_host *h, struct device_node *ct, + u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + { - int be, isrc; + /* Currently, we don't translate anything. That needs to be fixed as + * we get better defined device-trees. 
iic interrupts have to be + * explicitly mapped by whoever needs them + */ + return -ENODEV; +} +
+static struct irq_host_ops iic_host_ops = { + .match = iic_host_match, + .map = iic_host_map, + .xlate = iic_host_xlate, +}; +
+static void __init init_one_iic(unsigned int hw_cpu, unsigned long addr, + struct irq_host *host) +{ + /* XXX FIXME: should locate the linux CPU number from the HW cpu + * number properly. We are lucky for now + */ + struct iic *iic = &per_cpu(iic, hw_cpu); - /* XXX FIXME: Assume two threads per BE are present */ - for (be=0; be < num_present_cpus() / 2; be++) { - int irq; + iic->regs = ioremap(addr, sizeof(struct cbe_iic_thread_regs)); + BUG_ON(iic->regs == NULL); - /* setup SPE chip and handlers */ - for (isrc = 0; isrc < IIC_CLASS_STRIDE * 3; isrc++) { - irq = IIC_NODE_STRIDE * be + IIC_SPE_OFFSET + isrc; - set_irq_chip_and_handler(irq, &iic_chip, handle_fasteoi_irq); + iic->target_id = ((hw_cpu & 2) << 3) | ((hw_cpu & 1) ? 0xf : 0xe); + iic->eoi_stack[0] = 0xff; + iic->host = host; + out_be64(&iic->regs->prio, 0); + + printk(KERN_INFO "IIC for CPU %d at %lx mapped to %p, target id 0x%x\n", + hw_cpu, addr, iic->regs, iic->target_id); +} +
+static int __init setup_iic(void) +{ + struct device_node *dn; + struct resource r0, r1; + struct irq_host *host; + int found = 0; + u32 *np; + + for (dn = NULL; + (dn = of_find_node_by_name(dn,"interrupt-controller")) != NULL;) { + if (!device_is_compatible(dn, + "IBM,CBEA-Internal-Interrupt-Controller")) + continue; + np = (u32 *)get_property(dn, "ibm,interrupt-server-ranges", + NULL); + if (np == NULL) { + printk(KERN_WARNING "IIC: CPU association not found\n"); + of_node_put(dn); + return -ENODEV; } - /* setup cascade chip */ - irq = IIC_EXT_CASCADE + be * IIC_NODE_STRIDE; - set_irq_chip_and_handler(irq, &iic_chip, handle_fasteoi_irq); + if (of_address_to_resource(dn, 0, &r0) || + of_address_to_resource(dn, 1, &r1)) { + printk(KERN_WARNING "IIC: Can't resolve addresses\n"); + of_node_put(dn); + return -ENODEV; + } + host = NULL; + if (found < IIC_NODE_COUNT) { + host = irq_alloc_host(IRQ_HOST_MAP_LINEAR, + IIC_SOURCE_COUNT, + &iic_host_ops, + IIC_IRQ_INVALID); + iic_hosts[found] = host; + BUG_ON(iic_hosts[found] == NULL); + iic_hosts[found]->host_data = of_node_get(dn); + found++; + } + init_one_iic(np[0], r0.start, host); + init_one_iic(np[1], r1.start, host); } + + if (found) + return 0; + else + return -ENODEV; } void __init iic_init_IRQ(void) { - int cpu, irq_offset; - struct iic *iic; - + /* Discover and initialize iics */ if (setup_iic() < 0) - setup_iic_hardcoded(); + panic("IIC: Failed to initialize !\n"); - irq_offset = 0; - for_each_possible_cpu(cpu) { - iic = &per_cpu(iic, cpu); - if (iic->regs) - out_be64(&iic->regs->prio, 0xff); - } - iic_setup_builtin_handlers(); + /* Set master interrupt handling function */ + ppc_md.get_irq = iic_get_irq; + /* Enable on current CPU */ + iic_setup_cpu(); } diff --git a/arch/powerpc/platforms/cell/interrupt.h b/arch/powerpc/platforms/cell/interrupt.h index c74515aeb630..5560a92ec3ab 100644 --- a/arch/powerpc/platforms/cell/interrupt.h +++ b/arch/powerpc/platforms/cell/interrupt.h @@ -37,23 +37,22 @@ */ enum { - IIC_EXT_OFFSET = 0x00, /* Start of south bridge IRQs */ - IIC_EXT_CASCADE = 0x20, /* There is no interrupt 32 on spider */ - IIC_NUM_EXT = 0x40, /* Number of south bridge IRQs */ - IIC_SPE_OFFSET = 0x40, /* Start of SPE interrupts */ - IIC_CLASS_STRIDE = 0x10, /* SPE IRQs per class */ - IIC_IPI_OFFSET = 0x70, /* Start of IPI IRQs */ - IIC_NUM_IPIS = 0x10, /* IRQs
reserved for IPI */ - IIC_NODE_STRIDE = 0x80, /* Total IRQs per node */ + IIC_IRQ_INVALID = 0xff, + IIC_IRQ_MAX = 0x3f, + IIC_IRQ_EXT_IOIF0 = 0x20, + IIC_IRQ_EXT_IOIF1 = 0x2b, + IIC_IRQ_IPI0 = 0x40, + IIC_NUM_IPIS = 0x10, /* IRQs reserved for IPI */ + IIC_SOURCE_COUNT = 0x50, }; extern void iic_init_IRQ(void); -extern int iic_get_irq(struct pt_regs *regs); extern void iic_cause_IPI(int cpu, int mesg); extern void iic_request_IPIs(void); extern void iic_setup_cpu(void); extern u8 iic_get_target_id(int cpu); +extern struct irq_host *iic_get_irq_host(int node); extern void spider_init_IRQ(void); diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index 70a4e903f7e2..282987d6d4a2 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -80,6 +80,14 @@ static void cell_progress(char *s, unsigned short hex) printk("*** %04x : %s\n", hex, s ? s : ""); } +static void __init cell_pcibios_fixup(void) +{ + struct pci_dev *dev = NULL; + + for_each_pci_dev(dev) + pci_read_irq_line(dev); +} + static void __init cell_init_irq(void) { iic_init_IRQ(); @@ -130,8 +138,6 @@ static void __init cell_init_early(void) cell_init_iommu(); - ppc64_interrupt_controller = IC_CELL_PIC; - DBG(" <- cell_init_early()\n"); } @@ -178,8 +184,7 @@ define_machine(cell) { .check_legacy_ioport = cell_check_legacy_ioport, .progress = cell_progress, .init_IRQ = cell_init_irq, - .get_irq = iic_get_irq, - + .pcibios_fixup = cell_pcibios_fixup, #ifdef CONFIG_KEXEC .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 98425acb6cda..ae7ef88f1a37 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -56,58 +57,67 @@ enum { REISWAITEN = 0x508, /* Reissue Wait Control*/ }; -static void __iomem *spider_pics[4]; +#define SPIDER_CHIP_COUNT 4 +#define SPIDER_SRC_COUNT 64 +#define SPIDER_IRQ_INVALID 63 -static void __iomem *spider_get_pic(int irq) -{ - int node = irq / IIC_NODE_STRIDE; - irq %= IIC_NODE_STRIDE; - - if (irq >= IIC_EXT_OFFSET && - irq < IIC_EXT_OFFSET + IIC_NUM_EXT && - spider_pics) - return spider_pics[node]; - return NULL; -} +struct spider_pic { + struct irq_host *host; + struct device_node *of_node; + void __iomem *regs; + unsigned int node_id; +}; +static struct spider_pic spider_pics[SPIDER_CHIP_COUNT]; -static int spider_get_nr(unsigned int irq) +static struct spider_pic *spider_virq_to_pic(unsigned int virq) { - return (irq % IIC_NODE_STRIDE) - IIC_EXT_OFFSET; + return irq_map[virq].host->host_data; } -static void __iomem *spider_get_irq_config(int irq) +static void __iomem *spider_get_irq_config(struct spider_pic *pic, + unsigned int src) { - void __iomem *pic; - pic = spider_get_pic(irq); - return pic + TIR_CFGA + 8 * spider_get_nr(irq); + return pic->regs + TIR_CFGA + 8 * src; } -static void spider_unmask_irq(unsigned int irq) +static void spider_unmask_irq(unsigned int virq) { - int nodeid = (irq / IIC_NODE_STRIDE) * 0x10; - void __iomem *cfg = spider_get_irq_config(irq); - irq = spider_get_nr(irq); + struct spider_pic *pic = spider_virq_to_pic(virq); + void __iomem *cfg = spider_get_irq_config(pic, irq_map[virq].hwirq); - /* FIXME: Most of that is configuration and has nothing to do with enabling/disable, - * besides, it's also partially bogus. 
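The spider handlers that follow all lean on the lookup just defined: the reverse map hands back both the owning host (and through host_data the per-chip spider_pic) and the hardware source number, replacing the old IIC_NODE_STRIDE arithmetic entirely. A minimal sketch of the pattern; the "example_" name is illustrative, the lookups are the ones spider_virq_to_pic() and spider_get_irq_config() perform above:

/* Sketch: recover per-controller state and the hw source number
 * from a virtual irq, then compute the per-source config address.
 */
static void __iomem *example_irq_cfg(unsigned int virq)
{
	struct spider_pic *pic = irq_map[virq].host->host_data;
	unsigned int src = irq_map[virq].hwirq;

	return pic->regs + TIR_CFGA + 8 * src;
}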
+ /* We use no locking as we should be covered by the descriptor lock
 * for access to individual source configuration registers */ - out_be32(cfg, (in_be32(cfg) & ~0xf0)| 0x3107000eu | nodeid); - out_be32(cfg + 4, in_be32(cfg + 4) | 0x00020000u | irq); + out_be32(cfg, in_be32(cfg) | 0x30000000u); } -static void spider_mask_irq(unsigned int irq) +static void spider_mask_irq(unsigned int virq) { - void __iomem *cfg = spider_get_irq_config(irq); - irq = spider_get_nr(irq); + struct spider_pic *pic = spider_virq_to_pic(virq); + void __iomem *cfg = spider_get_irq_config(pic, irq_map[virq].hwirq); + /* We use no locking as we should be covered by the descriptor lock + * for access to individual source configuration registers + */ out_be32(cfg, in_be32(cfg) & ~0x30000000u); } -static void spider_ack_irq(unsigned int irq) +static void spider_ack_irq(unsigned int virq) { - /* Should reset edge detection logic but we don't configure any edge interrupt - * at the moment. + struct spider_pic *pic = spider_virq_to_pic(virq); + unsigned int src = irq_map[virq].hwirq; + + /* Reset edge detection logic if necessary */ + if (get_irq_desc(virq)->status & IRQ_LEVEL) + return; + + /* Only interrupts 47 to 50 can be set to edge */ + if (src < 47 || src > 50) + return; + + /* Perform the clear of the edge logic */ + out_be32(pic->regs + TIR_EDC, 0x100 | (src & 0xf)); } static struct irq_chip spider_pic = { @@ -117,102 +127,243 @@ static struct irq_chip spider_pic = { .ack = spider_ack_irq, }; -static int spider_get_irq(int node) +static int spider_host_match(struct irq_host *h, struct device_node *node) +{ + struct spider_pic *pic = h->host_data; + return node == pic->of_node; +} + +static int spider_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) { - unsigned long cs; - void __iomem *regs = spider_pics[node]; + unsigned int sense = flags & IRQ_TYPE_SENSE_MASK; + struct spider_pic *pic = h->host_data; + void __iomem *cfg = spider_get_irq_config(pic, hw); + int level = 0; + u32 ic; + + /* Note that only level high is supported for most interrupts */ + if (sense != IRQ_TYPE_NONE && sense != IRQ_TYPE_LEVEL_HIGH && + (hw < 47 || hw > 50)) + return -EINVAL; + + /* Decode sense type */ + switch(sense) { + case IRQ_TYPE_EDGE_RISING: + ic = 0x3; + break; + case IRQ_TYPE_EDGE_FALLING: + ic = 0x2; + break; + case IRQ_TYPE_LEVEL_LOW: + ic = 0x0; + level = 1; + break; + case IRQ_TYPE_LEVEL_HIGH: + case IRQ_TYPE_NONE: + ic = 0x1; + level = 1; + break; + default: + return -EINVAL; + } - cs = in_be32(regs + TIR_CS) >> 24; + /* Configure the source. One gross hack that was there before and + * that I've kept around is the priority to the BE which I set to + * be the same as the interrupt source number. I don't know whether + * that's supposed to make any kind of sense; we'll have to + * decide that, but for now, I'm not changing the behaviour. + */ + out_be32(cfg, (ic << 24) | (0x7 << 16) | (pic->node_id << 4) | 0xe); + out_be32(cfg + 4, (0x2 << 16) | (hw & 0xff)); - if (cs == 63) - return -1; - else - return cs; + if (level) + get_irq_desc(virq)->status |= IRQ_LEVEL; + set_irq_chip_and_handler(virq, &spider_pic, handle_level_irq); + return 0; +} + +static int spider_host_xlate(struct irq_host *h, struct device_node *ct, + u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + /* Spider interrupts have 2 cells, first is the interrupt source, + * second, well, I don't know for sure yet ...
We mask the top bits + * because old device-trees encode a node number in there + */ + *out_hwirq = intspec[0] & 0x3f; + *out_flags = IRQ_TYPE_LEVEL_HIGH; + return 0; } +static struct irq_host_ops spider_host_ops = { + .match = spider_host_match, + .map = spider_host_map, + .xlate = spider_host_xlate, +}; + static void spider_irq_cascade(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) { - int node = (int)(long)desc->handler_data; - int cascade_irq; + struct spider_pic *pic = desc->handler_data; + unsigned int cs, virq; - cascade_irq = spider_get_irq(node); - generic_handle_irq(cascade_irq, regs); + cs = in_be32(pic->regs + TIR_CS) >> 24; + if (cs == SPIDER_IRQ_INVALID) + virq = NO_IRQ; + else + virq = irq_linear_revmap(pic->host, cs); + if (virq != NO_IRQ) + generic_handle_irq(virq, regs); desc->chip->eoi(irq); } -/* hardcoded part to be compatible with older firmware */ +/* For hooking up the cascade we have a problem. Our device-tree is + * crap and we don't know which BE iic interrupt we are hooked on, at + * least not the "standard" way. We can reconstitute it based on two + * pieces of information though: which BE node we are connected to and whether + * we are connected to IOIF0 or IOIF1. Right now, we really only care + * about the IBM cell blade and we know that its firmware gives us an + * interrupt-map property which is pretty strange. + */ +static unsigned int __init spider_find_cascade_and_node(struct spider_pic *pic) +{ + unsigned int virq; + u32 *imap, *tmp; + int imaplen, intsize, unit; + struct device_node *iic; + struct irq_host *iic_host; + +#if 0 /* Enable that when we have a way to retrieve the node as well */ + /* First, we check whether we have a real "interrupts" in the device + * tree in case the device-tree is ever fixed + */ + struct of_irq oirq; + if (of_irq_map_one(pic->of_node, 0, &oirq) == 0) { + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); + goto bail; + } +#endif + + /* Now do the horrible hacks */ + tmp = (u32 *)get_property(pic->of_node, "#interrupt-cells", NULL); + if (tmp == NULL) + return NO_IRQ; + intsize = *tmp; + imap = (u32 *)get_property(pic->of_node, "interrupt-map", &imaplen); + if (imap == NULL || imaplen < (intsize + 1)) + return NO_IRQ; + iic = of_find_node_by_phandle(imap[intsize]); + if (iic == NULL) + return NO_IRQ; + imap += intsize + 1; + tmp = (u32 *)get_property(iic, "#interrupt-cells", NULL); + if (tmp == NULL) + return NO_IRQ; + intsize = *tmp; + /* Assume unit is last entry of interrupt specifier */ + unit = imap[intsize - 1]; + /* Ok, we have a unit, now let's try to get the node */ + tmp = (u32 *)get_property(iic, "ibm,interrupt-server-ranges", NULL); + if (tmp == NULL) { + of_node_put(iic); + return NO_IRQ; + } + /* ugly as hell but works for now */ + pic->node_id = (*tmp) >> 1; + of_node_put(iic); + + /* Ok, now let's get cracking; in isolation, this last step reads + * as the short sketch below.
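Concretely, the step the comment above announces: look up the iic host for the node just recovered and manufacture the class-2 cascade interrupt. The helper name is illustrative; iic_get_irq_host() and irq_create_mapping() are the interfaces introduced earlier in this patch:

/* Sketch only: map a spider cascade once node_id and unit are known. */
static unsigned int __init example_map_spider_cascade(int node_id, int unit)
{
	struct irq_host *host = iic_get_irq_host(node_id);

	if (host == NULL)
		return NO_IRQ;
	/* External sources of class 2 live at hw number 0x20 | unit */
	return irq_create_mapping(host, 0x20 | unit, 0);
}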
 * You may ask me why I just didn't match + * the iic host from the iic OF node, but that way I'm still compatible + * with really, really old firmware for which we don't have a node + */ + iic_host = iic_get_irq_host(pic->node_id); + if (iic_host == NULL) + return NO_IRQ; + /* Manufacture an IIC interrupt number of class 2 */ + virq = irq_create_mapping(iic_host, 0x20 | unit, 0); + if (virq == NO_IRQ) + printk(KERN_ERR "spider_pic: failed to map cascade !"); + return virq; +} + -static void __init spider_init_one(int node, unsigned long addr) +static void __init spider_init_one(struct device_node *of_node, int chip, + unsigned long addr) { - int n, irq; + struct spider_pic *pic = &spider_pics[chip]; + int i, virq; - spider_pics[node] = ioremap(addr, 0x800); - if (spider_pics[node] == NULL) + /* Map registers */ + pic->regs = ioremap(addr, 0x1000); + if (pic->regs == NULL) panic("spider_pic: can't map registers !"); - printk(KERN_INFO "spider_pic: mapped for node %d, addr: 0x%lx mapped to %p\n", - node, addr, spider_pics[node]); + /* Allocate a host */ + pic->host = irq_alloc_host(IRQ_HOST_MAP_LINEAR, SPIDER_SRC_COUNT, + &spider_host_ops, SPIDER_IRQ_INVALID); + if (pic->host == NULL) + panic("spider_pic: can't allocate irq host !"); + pic->host->host_data = pic; - for (n = 0; n < IIC_NUM_EXT; n++) { - if (n == IIC_EXT_CASCADE) - continue; - irq = n + IIC_EXT_OFFSET + node * IIC_NODE_STRIDE; - set_irq_chip_and_handler(irq, &spider_pic, handle_level_irq); - get_irq_desc(irq)->status |= IRQ_LEVEL; + /* Fill out other bits */ + pic->of_node = of_node_get(of_node); + + /* Go through all sources and disable them */ + for (i = 0; i < SPIDER_SRC_COUNT; i++) { + void __iomem *cfg = pic->regs + TIR_CFGA + 8 * i; + out_be32(cfg, in_be32(cfg) & ~0x30000000u); } /* do not mask any interrupts because of level */ - out_be32(spider_pics[node] + TIR_MSK, 0x0); - - /* disable edge detection clear */ - /* out_be32(spider_pics[node] + TIR_EDC, 0x0); */ + out_be32(pic->regs + TIR_MSK, 0x0); /* enable interrupt packets to be output */ - out_be32(spider_pics[node] + TIR_PIEN, - in_be32(spider_pics[node] + TIR_PIEN) | 0x1); + out_be32(pic->regs + TIR_PIEN, in_be32(pic->regs + TIR_PIEN) | 0x1); - /* Hook up cascade */ - irq = IIC_EXT_CASCADE + node * IIC_NODE_STRIDE; - set_irq_data(irq, (void *)(long)node); - set_irq_chained_handler(irq, spider_irq_cascade); + /* Hook up the cascade interrupt to the iic and nodeid */ + virq = spider_find_cascade_and_node(pic); + if (virq == NO_IRQ) + return; + set_irq_data(virq, pic); + set_irq_chained_handler(virq, spider_irq_cascade); + + printk(KERN_INFO "spider_pic: node %d, addr: 0x%lx %s\n", + pic->node_id, addr, of_node->full_name); /* Enable the interrupt detection enable bit. Do this last! */ - out_be32(spider_pics[node] + TIR_DEN, - in_be32(spider_pics[node] + TIR_DEN) | 0x1); + out_be32(pic->regs + TIR_DEN, in_be32(pic->regs + TIR_DEN) | 0x1); } void __init spider_init_IRQ(void) { - unsigned long *spider_reg; + struct resource r; struct device_node *dn; - char *compatible; - int node = 0; - - /* XXX node numbers are totally bogus. We _hope_ we get the device nodes in the right - * order here but that's definitely not guaranteed, we need to get the node from the - * device tree instead. There is currently no proper property for it (but our whole - * device-tree is bogus anyway) so all we can do is pray or maybe test the address - * and deduce the node-id + int chip = 0; + + /* XXX node numbers are totally bogus.
We _hope_ we get the device + * nodes in the right order here but that's definitely not guaranteed, + * we need to get the node from the device tree instead. + * There is currently no proper property for it (but our whole + * device-tree is bogus anyway) so all we can do is pray or maybe test + * the address and deduce the node-id */ - for (dn = NULL; (dn = of_find_node_by_name(dn, "interrupt-controller"));) { - compatible = (char *)get_property(dn, "compatible", NULL); - - if (!compatible) - continue; - - if (strstr(compatible, "CBEA,platform-spider-pic")) - spider_reg = (unsigned long *)get_property(dn, "reg", NULL); - else if (strstr(compatible, "sti,platform-spider-pic") && (node < 2)) { - static long hard_coded_pics[] = { 0x24000008000, 0x34000008000 }; - spider_reg = &hard_coded_pics[node]; + for (dn = NULL; + (dn = of_find_node_by_name(dn, "interrupt-controller"));) { + if (device_is_compatible(dn, "CBEA,platform-spider-pic")) { + if (of_address_to_resource(dn, 0, &r)) { + printk(KERN_WARNING "spider-pic: Failed\n"); + continue; + } + } else if (device_is_compatible(dn, "sti,platform-spider-pic") + && (chip < 2)) { + static long hard_coded_pics[] = + { 0x24000008000, 0x34000008000 }; + r.start = hard_coded_pics[chip]; } else continue; - - if (spider_reg == NULL) - printk(KERN_ERR "spider_pic: No address for node %d\n", node); - - spider_init_one(node, *spider_reg); - node++; + spider_init_one(dn, chip++, r.start); } } diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 656c1ef5f4ad..5d2313a6c82b 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -264,51 +264,57 @@ spu_irq_class_2(int irq, void *data, struct pt_regs *regs) return stat ? IRQ_HANDLED : IRQ_NONE; } -static int -spu_request_irqs(struct spu *spu) +static int spu_request_irqs(struct spu *spu) { - int ret; - int irq_base; - - irq_base = IIC_NODE_STRIDE * spu->node + IIC_SPE_OFFSET; - - snprintf(spu->irq_c0, sizeof (spu->irq_c0), "spe%02d.0", spu->number); - ret = request_irq(irq_base + spu->isrc, - spu_irq_class_0, IRQF_DISABLED, spu->irq_c0, spu); - if (ret) - goto out; - - snprintf(spu->irq_c1, sizeof (spu->irq_c1), "spe%02d.1", spu->number); - ret = request_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc, - spu_irq_class_1, IRQF_DISABLED, spu->irq_c1, spu); - if (ret) - goto out1; + int ret = 0; - snprintf(spu->irq_c2, sizeof (spu->irq_c2), "spe%02d.2", spu->number); - ret = request_irq(irq_base + 2*IIC_CLASS_STRIDE + spu->isrc, - spu_irq_class_2, IRQF_DISABLED, spu->irq_c2, spu); - if (ret) - goto out2; - goto out; + if (spu->irqs[0] != NO_IRQ) { + snprintf(spu->irq_c0, sizeof (spu->irq_c0), "spe%02d.0", + spu->number); + ret = request_irq(spu->irqs[0], spu_irq_class_0, + IRQF_DISABLED, + spu->irq_c0, spu); + if (ret) + goto bail0; + } + if (spu->irqs[1] != NO_IRQ) { + snprintf(spu->irq_c1, sizeof (spu->irq_c1), "spe%02d.1", + spu->number); + ret = request_irq(spu->irqs[1], spu_irq_class_1, + IRQF_DISABLED, + spu->irq_c1, spu); + if (ret) + goto bail1; + } + if (spu->irqs[2] != NO_IRQ) { + snprintf(spu->irq_c2, sizeof (spu->irq_c2), "spe%02d.2", + spu->number); + ret = request_irq(spu->irqs[2], spu_irq_class_2, + IRQF_DISABLED, + spu->irq_c2, spu); + if (ret) + goto bail2; + } + return 0; -out2: - free_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc, spu); -out1: - free_irq(irq_base + spu->isrc, spu); -out: +bail2: + if (spu->irqs[1] != NO_IRQ) + free_irq(spu->irqs[1], spu); +bail1: + if (spu->irqs[0] != NO_IRQ) + 
free_irq(spu->irqs[0], spu); +bail0: return ret; } -static void -spu_free_irqs(struct spu *spu) +static void spu_free_irqs(struct spu *spu) { - int irq_base; - - irq_base = IIC_NODE_STRIDE * spu->node + IIC_SPE_OFFSET; - - free_irq(irq_base + spu->isrc, spu); - free_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc, spu); - free_irq(irq_base + 2*IIC_CLASS_STRIDE + spu->isrc, spu); + if (spu->irqs[0] != NO_IRQ) + free_irq(spu->irqs[0], spu); + if (spu->irqs[1] != NO_IRQ) + free_irq(spu->irqs[1], spu); + if (spu->irqs[2] != NO_IRQ) + free_irq(spu->irqs[2], spu); } static LIST_HEAD(spu_list); @@ -559,17 +565,38 @@ static void spu_unmap(struct spu *spu) iounmap((u8 __iomem *)spu->local_store); } +/* This function shall be abstracted for HV platforms */ +static int __init spu_map_interrupts(struct spu *spu, struct device_node *np) +{ + struct irq_host *host; + unsigned int isrc; + u32 *tmp; + + host = iic_get_irq_host(spu->node); + if (host == NULL) + return -ENODEV; + + /* Get the interrupt source from the device-tree */ + tmp = (u32 *)get_property(np, "isrc", NULL); + if (!tmp) + return -ENODEV; + spu->isrc = isrc = tmp[0]; + + /* Now map interrupts of all 3 classes */ + spu->irqs[0] = irq_create_mapping(host, 0x00 | isrc, 0); + spu->irqs[1] = irq_create_mapping(host, 0x10 | isrc, 0); + spu->irqs[2] = irq_create_mapping(host, 0x20 | isrc, 0); + + /* Right now, we only fail if class 2 failed */ + return spu->irqs[2] == NO_IRQ ? -EINVAL : 0; +} + static int __init spu_map_device(struct spu *spu, struct device_node *node) { char *prop; int ret; ret = -ENODEV; - prop = get_property(node, "isrc", NULL); - if (!prop) - goto out; - spu->isrc = *(unsigned int *)prop; - spu->name = get_property(node, "name", NULL); if (!spu->name) goto out; @@ -636,7 +663,8 @@ static int spu_create_sysdev(struct spu *spu) return ret; } - sysdev_create_file(&spu->sysdev, &attr_isrc); + if (spu->isrc != 0) + sysdev_create_file(&spu->sysdev, &attr_isrc); sysfs_add_device_to_node(&spu->sysdev, spu->nid); return 0; @@ -668,6 +696,9 @@ static int __init create_spu(struct device_node *spe) spu->nid = of_node_to_nid(spe); if (spu->nid == -1) spu->nid = 0; + ret = spu_map_interrupts(spu, spe); + if (ret) + goto out_unmap; spin_lock_init(&spu->register_lock); spu_mfc_sdr_set(spu, mfspr(SPRN_SDR1)); spu_mfc_sr1_set(spu, 0x33); diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c index 66c253498803..6802cdc3168a 100644 --- a/arch/powerpc/platforms/chrp/pci.c +++ b/arch/powerpc/platforms/chrp/pci.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -161,15 +160,9 @@ void __init chrp_pcibios_fixup(void) { struct pci_dev *dev = NULL; - struct device_node *np; - /* PCI interrupts are controlled by the OpenPIC */ - for_each_pci_dev(dev) { - np = pci_device_to_OF_node(dev); - if ((np != 0) && (np->n_intrs > 0) && (np->intrs[0].line != 0)) - dev->irq = np->intrs[0].line; - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); - } + for_each_pci_dev(dev) + pci_read_irq_line(dev); } #define PRG_CL_RESET_VALID 0x00010000 diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index a5dffc8d93a9..bb10171132fa 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -59,7 +59,7 @@ void rtas_indicator_progress(char *, unsigned short); int _chrp_type; EXPORT_SYMBOL(_chrp_type); -struct mpic *chrp_mpic; +static struct mpic *chrp_mpic; /* Used for doing CHRP event-scans */ DEFINE_PER_CPU(struct timer_list, heartbeat_timer); @@ 
-315,19 +315,13 @@ chrp_event_scan(unsigned long unused) jiffies + event_scan_interval); } -void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc, - struct pt_regs *regs) +static void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) { - unsigned int max = 100; - - while(max--) { - int irq = i8259_irq(regs); - if (max == 99) - desc->chip->eoi(irq); - if (irq < 0) - break; - generic_handle_irq(irq, regs); - }; + unsigned int cascade_irq = i8259_irq(regs); + if (cascade_irq != NO_IRQ) + generic_handle_irq(cascade_irq, regs); + desc->chip->eoi(irq); } /* @@ -336,18 +330,17 @@ void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc, static void __init chrp_find_openpic(void) { struct device_node *np, *root; - int len, i, j, irq_count; + int len, i, j; int isu_size, idu_size; unsigned int *iranges, *opprop = NULL; int oplen = 0; unsigned long opaddr; int na = 1; - unsigned char init_senses[NR_IRQS - NUM_8259_INTERRUPTS]; - np = find_type_devices("open-pic"); + np = of_find_node_by_type(NULL, "open-pic"); if (np == NULL) return; - root = find_path_device("/"); + root = of_find_node_by_path("/"); if (root) { opprop = (unsigned int *) get_property (root, "platform-open-pic", &oplen); @@ -358,19 +351,15 @@ static void __init chrp_find_openpic(void) oplen /= na * sizeof(unsigned int); } else { struct resource r; - if (of_address_to_resource(np, 0, &r)) - return; + if (of_address_to_resource(np, 0, &r)) { + goto bail; + } opaddr = r.start; oplen = 0; } printk(KERN_INFO "OpenPIC at %lx\n", opaddr); - irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */ - prom_get_irq_senses(init_senses, NUM_ISA_INTERRUPTS, NR_IRQS - 4); - /* i8259 cascade is always positive level */ - init_senses[0] = IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE; - iranges = (unsigned int *) get_property(np, "interrupt-ranges", &len); if (iranges == NULL) len = 0; /* non-distributed mpic */ @@ -397,15 +386,12 @@ static void __init chrp_find_openpic(void) if (len > 1) isu_size = iranges[3]; - chrp_mpic = mpic_alloc(opaddr, MPIC_PRIMARY, - isu_size, NUM_ISA_INTERRUPTS, irq_count, - NR_IRQS - 4, init_senses, irq_count, - " MPIC "); + chrp_mpic = mpic_alloc(np, opaddr, MPIC_PRIMARY, + isu_size, 0, " MPIC "); if (chrp_mpic == NULL) { printk(KERN_ERR "Failed to allocate MPIC structure\n"); - return; + goto bail; } - j = na - 1; for (i = 1; i < len; ++i) { iranges += 2; @@ -417,7 +403,10 @@ static void __init chrp_find_openpic(void) } mpic_init(chrp_mpic); - set_irq_chained_handler(NUM_ISA_INTERRUPTS, chrp_8259_cascade); + ppc_md.get_irq = mpic_get_irq; + bail: + of_node_put(root); + of_node_put(np); } #if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(XMON) @@ -428,14 +417,34 @@ static struct irqaction xmon_irqaction = { }; #endif -void __init chrp_init_IRQ(void) +static void __init chrp_find_8259(void) { - struct device_node *np; + struct device_node *np, *pic = NULL; unsigned long chrp_int_ack = 0; -#if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(XMON) - struct device_node *kbd; -#endif + unsigned int cascade_irq; + /* Look for cascade */ + for_each_node_by_type(np, "interrupt-controller") + if (device_is_compatible(np, "chrp,iic")) { + pic = np; + break; + } + /* Ok, 8259 wasn't found. 
We need to handle the case where + * we have a pegasos that claims to be chrp but doesn't have + * a proper interrupt tree + */ + if (pic == NULL && chrp_mpic != NULL) { + printk(KERN_ERR "i8259: Not found in device-tree" + " assuming no legacy interrupts\n"); + return; + } + + /* Look for intack. In a perfect world, we would look for it on + * the ISA bus that holds the 8259 but heh... Works that way. If + * we ever see a problem, we can try to re-use the pSeries code here. + * Also, Pegasos-type platforms don't have a proper node to start + * from anyway + */ for (np = find_devices("pci"); np != NULL; np = np->next) { unsigned int *addrp = (unsigned int *) get_property(np, "8259-interrupt-acknowledge", NULL); @@ -446,11 +455,29 @@ void __init chrp_init_IRQ(void) break; } if (np == NULL) - printk(KERN_ERR "Cannot find PCI interrupt acknowledge address\n"); + printk(KERN_WARNING "Cannot find PCI interrupt acknowledge" + " address, polling\n"); + + i8259_init(pic, chrp_int_ack); + if (ppc_md.get_irq == NULL) + ppc_md.get_irq = i8259_irq; + if (chrp_mpic != NULL) { + cascade_irq = irq_of_parse_and_map(pic, 0); + if (cascade_irq == NO_IRQ) + printk(KERN_ERR "i8259: failed to map cascade irq\n"); + else + set_irq_chained_handler(cascade_irq, + chrp_8259_cascade); + } +} +void __init chrp_init_IRQ(void) +{ +#if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(XMON) + struct device_node *kbd; +#endif chrp_find_openpic(); - - i8259_init(chrp_int_ack, 0); + chrp_find_8259(); if (_chrp_type == _CHRP_Pegasos) ppc_md.get_irq = i8259_irq; @@ -535,10 +562,6 @@ static int __init chrp_probe(void) DMA_MODE_READ = 0x44; DMA_MODE_WRITE = 0x48; isa_io_base = CHRP_ISA_IO_BASE; /* default value */ - ppc_do_canonicalize_irqs = 1; - - /* Assume we have an 8259... */ - __irq_offset_value = NUM_ISA_INTERRUPTS; return 1; } @@ -550,7 +573,6 @@ define_machine(chrp) { .init = chrp_init2, .show_cpuinfo = chrp_show_cpuinfo, .init_IRQ = chrp_init_IRQ, - .get_irq = mpic_get_irq, .pcibios_fixup = chrp_pcibios_fixup, .restart = rtas_restart, .power_off = rtas_power_off, diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c index c298ca1ea680..1d2307e87c30 100644 --- a/arch/powerpc/platforms/chrp/smp.c +++ b/arch/powerpc/platforms/chrp/smp.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index 7fb6a08786c0..2275e64f3152 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -162,27 +162,6 @@ static void pci_event_handler(struct HvLpEvent *event, struct pt_regs *regs) printk(KERN_ERR "pci_event_handler: NULL event received\n"); } -/* - * This is called by init_IRQ. set in ppc_md.init_IRQ by iSeries_setup.c - * It must be called before the bus walk. 
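The chrp_find_8259() rework just above shows the canonical way to chain a legacy 8259 behind another controller under the new scheme: resolve the cascade from the PIC's own device node, then install a chained handler. Condensed, under the same assumptions as that hunk (names taken from it, error print dropped):

/* Sketch of the cascade hookup performed in chrp_find_8259() above. */
static void __init example_hook_8259(struct device_node *pic)
{
	unsigned int cascade_irq = irq_of_parse_and_map(pic, 0);

	if (cascade_irq != NO_IRQ)
		set_irq_chained_handler(cascade_irq, chrp_8259_cascade);
}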
- */ -void __init iSeries_init_IRQ(void) -{ - /* Register PCI event handler and open an event path */ - int ret; - - ret = HvLpEvent_registerHandler(HvLpEvent_Type_PciIo, - &pci_event_handler); - if (ret == 0) { - ret = HvLpEvent_openPath(HvLpEvent_Type_PciIo, 0); - if (ret != 0) - printk(KERN_ERR "iseries_init_IRQ: open event path " - "failed with rc 0x%x\n", ret); - } else - printk(KERN_ERR "iseries_init_IRQ: register handler " - "failed with rc 0x%x\n", ret); -} - #define REAL_IRQ_TO_SUBBUS(irq) (((irq) >> 14) & 0xff) #define REAL_IRQ_TO_BUS(irq) ((((irq) >> 6) & 0xff) + 1) #define REAL_IRQ_TO_IDSEL(irq) ((((irq) >> 3) & 7) + 1) @@ -196,7 +175,7 @@ static void iseries_enable_IRQ(unsigned int irq) { u32 bus, dev_id, function, mask; const u32 sub_bus = 0; - unsigned int rirq = virt_irq_to_real_map[irq]; + unsigned int rirq = (unsigned int)irq_map[irq].hwirq; /* The IRQ has already been locked by the caller */ bus = REAL_IRQ_TO_BUS(rirq); @@ -213,7 +192,7 @@ static unsigned int iseries_startup_IRQ(unsigned int irq) { u32 bus, dev_id, function, mask; const u32 sub_bus = 0; - unsigned int rirq = virt_irq_to_real_map[irq]; + unsigned int rirq = (unsigned int)irq_map[irq].hwirq; bus = REAL_IRQ_TO_BUS(rirq); function = REAL_IRQ_TO_FUNC(rirq); @@ -254,7 +233,7 @@ static void iseries_shutdown_IRQ(unsigned int irq) { u32 bus, dev_id, function, mask; const u32 sub_bus = 0; - unsigned int rirq = virt_irq_to_real_map[irq]; + unsigned int rirq = (unsigned int)irq_map[irq].hwirq; /* irq should be locked by the caller */ bus = REAL_IRQ_TO_BUS(rirq); @@ -277,7 +256,7 @@ static void iseries_disable_IRQ(unsigned int irq) { u32 bus, dev_id, function, mask; const u32 sub_bus = 0; - unsigned int rirq = virt_irq_to_real_map[irq]; + unsigned int rirq = (unsigned int)irq_map[irq].hwirq; /* The IRQ has already been locked by the caller */ bus = REAL_IRQ_TO_BUS(rirq); @@ -291,7 +270,7 @@ static void iseries_disable_IRQ(unsigned int irq) static void iseries_end_IRQ(unsigned int irq) { - unsigned int rirq = virt_irq_to_real_map[irq]; + unsigned int rirq = (unsigned int)irq_map[irq].hwirq; HvCallPci_eoi(REAL_IRQ_TO_BUS(rirq), REAL_IRQ_TO_SUBBUS(rirq), (REAL_IRQ_TO_IDSEL(rirq) << 4) + REAL_IRQ_TO_FUNC(rirq)); @@ -314,16 +293,14 @@ static struct irq_chip iseries_pic = { int __init iSeries_allocate_IRQ(HvBusNumber bus, HvSubBusNumber sub_bus, u32 bsubbus) { - int virtirq; unsigned int realirq; u8 idsel = ISERIES_GET_DEVICE_FROM_SUBBUS(bsubbus); u8 function = ISERIES_GET_FUNCTION_FROM_SUBBUS(bsubbus); realirq = (((((sub_bus << 8) + (bus - 1)) << 3) + (idsel - 1)) << 3) + function; - virtirq = virt_irq_create_mapping(realirq); - set_irq_chip_and_handler(virtirq, &iseries_pic, handle_fasteoi_irq); - return virtirq; + + return irq_create_mapping(NULL, realirq, IRQ_TYPE_NONE); } #endif /* CONFIG_PCI */ @@ -331,10 +308,9 @@ int __init iSeries_allocate_IRQ(HvBusNumber bus, /* * Get the next pending IRQ. 
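iSeries_allocate_IRQ() above passes a NULL host to irq_create_mapping(), which means "use the default host" -- the NOMAP host that the reworked iSeries_init_IRQ() further below registers with irq_set_default_host(). A sketch of the resulting round trip; the "example_" names are illustrative, and realirq encodes bus/subbus/idsel/function via the REAL_IRQ_TO_* macros above:

/* Sketch: allocate a virq for a hypervisor source, then recover the
 * raw number inside a handler. With a NOMAP default host there is no
 * reverse map to consult; irq_map[] carries the hw number directly.
 */
static unsigned int example_alloc_hv_irq(unsigned int realirq)
{
	return irq_create_mapping(NULL, realirq, IRQ_TYPE_NONE);
}

static unsigned int example_hw_number(unsigned int virq)
{
	return (unsigned int)irq_map[virq].hwirq;
}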
*/ -int iSeries_get_irq(struct pt_regs *regs) +unsigned int iSeries_get_irq(struct pt_regs *regs) { - /* -2 means ignore this interrupt */ - int irq = -2; + int irq = NO_IRQ_IGNORE; #ifdef CONFIG_SMP if (get_lppaca()->int_dword.fields.ipi_cnt) { @@ -357,9 +333,57 @@ int iSeries_get_irq(struct pt_regs *regs) } spin_unlock(&pending_irqs_lock); if (irq >= NR_IRQS) - irq = -2; + irq = NO_IRQ_IGNORE; } #endif return irq; } + +static int iseries_irq_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + set_irq_chip_and_handler(virq, &iseries_pic, handle_fasteoi_irq); + + return 0; +} + +static struct irq_host_ops iseries_irq_host_ops = { + .map = iseries_irq_host_map, +}; + +/* + * This is called by init_IRQ. set in ppc_md.init_IRQ by iSeries_setup.c + * It must be called before the bus walk. + */ +void __init iSeries_init_IRQ(void) +{ + /* Register PCI event handler and open an event path */ + struct irq_host *host; + int ret; + + /* + * The Hypervisor only allows us up to 256 interrupt + * sources (the irq number is passed in a u8). + */ + irq_set_virq_count(256); + + /* Create irq host. No need for a revmap since HV will give us + * back our virtual irq number + */ + host = irq_alloc_host(IRQ_HOST_MAP_NOMAP, 0, &iseries_irq_host_ops, 0); + BUG_ON(host == NULL); + irq_set_default_host(host); + + ret = HvLpEvent_registerHandler(HvLpEvent_Type_PciIo, + &pci_event_handler); + if (ret == 0) { + ret = HvLpEvent_openPath(HvLpEvent_Type_PciIo, 0); + if (ret != 0) + printk(KERN_ERR "iseries_init_IRQ: open event path " + "failed with rc 0x%x\n", ret); + } else + printk(KERN_ERR "iseries_init_IRQ: register handler " + "failed with rc 0x%x\n", ret); +} + diff --git a/arch/powerpc/platforms/iseries/irq.h b/arch/powerpc/platforms/iseries/irq.h index 188aa808abd7..1ee8985140e5 100644 --- a/arch/powerpc/platforms/iseries/irq.h +++ b/arch/powerpc/platforms/iseries/irq.h @@ -4,6 +4,6 @@ extern void iSeries_init_IRQ(void); extern int iSeries_allocate_IRQ(HvBusNumber, HvSubBusNumber, u32); extern void iSeries_activate_IRQs(void); -extern int iSeries_get_irq(struct pt_regs *); +extern unsigned int iSeries_get_irq(struct pt_regs *); #endif /* _ISERIES_IRQ_H */ diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index c877074745b2..c9605d773a77 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -294,8 +294,6 @@ static void __init iSeries_init_early(void) { DBG(" -> iSeries_init_early()\n"); - ppc64_interrupt_controller = IC_ISERIES; - #if defined(CONFIG_BLK_DEV_INITRD) /* * If the init RAM disk has been configured and there is @@ -659,12 +657,6 @@ static int __init iseries_probe(void) powerpc_firmware_features |= FW_FEATURE_ISERIES; powerpc_firmware_features |= FW_FEATURE_LPAR; - /* - * The Hypervisor only allows us up to 256 interrupt - * sources (the irq number is passed in a u8). - */ - virt_irq_max = 255; - hpte_init_iSeries(); return 1; diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index f7170ff86dab..63a1670d3bfd 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -443,18 +443,23 @@ void __init maple_pci_init(void) int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) { struct device_node *np; - int irq = channel ? 15 : 14; + unsigned int defirq = channel ? 
15 : 14; + unsigned int irq; if (pdev->vendor != PCI_VENDOR_ID_AMD || pdev->device != PCI_DEVICE_ID_AMD_8111_IDE) - return irq; + return defirq; np = pci_device_to_OF_node(pdev); if (np == NULL) - return irq; - if (np->n_intrs < 2) - return irq; - return np->intrs[channel & 0x1].line; + return defirq; + irq = irq_of_parse_and_map(np, channel & 0x1); + if (irq == NO_IRQ) { + printk("Failed to map onboard IDE interrupt for channel %d\n", + channel); + return defirq; + } + return irq; } /* XXX: To remove once all firmwares are ok */ diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c index 611ca8e979e5..cb528c9de4c3 100644 --- a/arch/powerpc/platforms/maple/setup.c +++ b/arch/powerpc/platforms/maple/setup.c @@ -198,50 +198,81 @@ static void __init maple_init_early(void) { DBG(" -> maple_init_early\n"); - /* Setup interrupt mapping options */ - ppc64_interrupt_controller = IC_OPEN_PIC; - iommu_init_early_dart(); DBG(" <- maple_init_early\n"); } - -static __init void maple_init_IRQ(void) +/* + * This is almost identical to pSeries and CHRP. We need to make that + * code generic at one point, with appropriate bits in the device-tree to + * identify the presence of an HT APIC + */ +static void __init maple_init_IRQ(void) { - struct device_node *root; + struct device_node *root, *np, *mpic_node = NULL; unsigned int *opprop; - unsigned long opic_addr; + unsigned long openpic_addr = 0; + int naddr, n, i, opplen, has_isus = 0; struct mpic *mpic; - unsigned char senses[128]; - int n; + unsigned int flags = MPIC_PRIMARY; - DBG(" -> maple_init_IRQ\n"); + /* Locate MPIC in the device-tree. Note that there is a bug + * in Maple device-tree where the type of the controller is + * open-pic and not interrupt-controller + */ + for_each_node_by_type(np, "open-pic") { + mpic_node = np; + break; + } + if (mpic_node == NULL) { + printk(KERN_ERR + "Failed to locate the MPIC interrupt controller\n"); + return; + } - /* XXX: Non standard, replace that with a proper openpic/mpic node - * in the device-tree. Find the Open PIC if present */ + /* Find address list in /platform-open-pic */ root = of_find_node_by_path("/"); - opprop = (unsigned int *) get_property(root, - "platform-open-pic", NULL); - if (opprop == 0) - panic("OpenPIC not found !\n"); - - n = prom_n_addr_cells(root); - for (opic_addr = 0; n > 0; --n) - opic_addr = (opic_addr << 32) + *opprop++; + naddr = prom_n_addr_cells(root); + opprop = (unsigned int *) get_property(root, "platform-open-pic", + &opplen); + if (opprop != 0) { + openpic_addr = of_read_number(opprop, naddr); + has_isus = (opplen > naddr); + printk(KERN_DEBUG "OpenPIC addr: %lx, has ISUs: %d\n", + openpic_addr, has_isus); + } of_node_put(root); - /* Obtain sense values from device-tree */ - prom_get_irq_senses(senses, 0, 128); + BUG_ON(openpic_addr == 0); - mpic = mpic_alloc(opic_addr, - MPIC_PRIMARY | MPIC_BIG_ENDIAN | - MPIC_BROKEN_U3 | MPIC_WANTS_RESET, - 0, 0, 128, 128, senses, 128, "U3-MPIC"); + /* Check for a big endian MPIC */ + if (get_property(np, "big-endian", NULL) != NULL) + flags |= MPIC_BIG_ENDIAN; + + /* XXX Maple specific bits */ + flags |= MPIC_BROKEN_U3 | MPIC_WANTS_RESET; + + /* Setup the openpic driver. More device-tree junks, we hard code no + * ISUs for now. I'll have to revisit some stuffs with the folks doing + * the firmware for those + */ + mpic = mpic_alloc(mpic_node, openpic_addr, flags, + /*has_isus ? 
16 :*/ 0, 0, " MPIC "); BUG_ON(mpic == NULL); - mpic_init(mpic); - DBG(" <- maple_init_IRQ\n"); + /* Add ISUs */ + opplen /= sizeof(u32); + for (n = 0, i = naddr; i < opplen; i += naddr, n++) { + unsigned long isuaddr = of_read_number(opprop + i, naddr); + mpic_assign_isu(mpic, n, isuaddr); + } + + /* All ISUs are setup, complete initialization */ + mpic_init(mpic); + ppc_md.get_irq = mpic_get_irq; + of_node_put(mpic_node); } static void __init maple_progress(char *s, unsigned short hex) @@ -279,7 +310,6 @@ define_machine(maple_md) { .setup_arch = maple_setup_arch, .init_early = maple_init_early, .init_IRQ = maple_init_IRQ, - .get_irq = mpic_get_irq, .pcibios_fixup = maple_pcibios_fixup, .pci_get_legacy_ide_irq = maple_pci_get_legacy_ide_irq, .restart = maple_restart, diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c index 5685ad9e88e8..e63d52f227ee 100644 --- a/arch/powerpc/platforms/powermac/bootx_init.c +++ b/arch/powerpc/platforms/powermac/bootx_init.c @@ -162,6 +162,8 @@ static void __init bootx_add_chosen_props(unsigned long base, { u32 val; + bootx_dt_add_prop("linux,bootx", NULL, 0, mem_end); + if (bootx_info->kernelParamsOffset) { char *args = (char *)((unsigned long)bootx_info) + bootx_info->kernelParamsOffset; @@ -228,7 +230,7 @@ static void __init bootx_scan_dt_build_strings(unsigned long base, if (!strcmp(namep, "/chosen")) { DBG(" detected /chosen ! adding properties names !\n"); - bootx_dt_add_string("linux,platform", mem_end); + bootx_dt_add_string("linux,bootx", mem_end); bootx_dt_add_string("linux,stdout-path", mem_end); bootx_dt_add_string("linux,initrd-start", mem_end); bootx_dt_add_string("linux,initrd-end", mem_end); diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index ceafaf52a668..8677f50c2586 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -522,10 +522,11 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) host->speed = KW_I2C_MODE_25KHZ; break; } - if (np->n_intrs > 0) - host->irq = np->intrs[0].line; - else - host->irq = NO_IRQ; + host->irq = irq_of_parse_and_map(np, 0); + if (host->irq == NO_IRQ) + printk(KERN_WARNING + "low_i2c: Failed to map interrupt for %s\n", + np->full_name); host->base = ioremap((*addrp), 0x1000); if (host->base == NULL) { diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index 41fa2409482a..6a36ea9bf673 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -29,6 +29,8 @@ #include #include +#include "pmac.h" + #define DEBUG #ifdef DEBUG @@ -80,9 +82,6 @@ static int nvram_partitions[3]; // XXX Turn that into a sem static DEFINE_SPINLOCK(nv_lock); -extern int pmac_newworld; -extern int system_running; - static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index d524a915aa86..556b349797e8 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -46,6 +46,9 @@ static int has_uninorth; static struct pci_controller *u3_agp; static struct pci_controller *u4_pcie; static struct pci_controller *u3_ht; +#define has_second_ohare 0 +#else +static int has_second_ohare; #endif /* CONFIG_PPC64 */ extern u8 pci_cache_line_size; @@ -647,6 +650,33 @@ static void __init
init_p2pbridge(void) early_write_config_word(hose, bus, devfn, PCI_BRIDGE_CONTROL, val); } +static void __init init_second_ohare(void) +{ + struct device_node *np = of_find_node_by_name(NULL, "pci106b,7"); + unsigned char bus, devfn; + unsigned short cmd; + + if (np == NULL) + return; + + /* This must run before we initialize the PICs since the second + * ohare hosts a PIC that will be accessed there. + */ + if (pci_device_from_OF_node(np, &bus, &devfn) == 0) { + struct pci_controller* hose = + pci_find_hose_for_OF_device(np); + if (!hose) { + printk(KERN_ERR "Can't find PCI hose for OHare2 !\n"); + return; + } + early_read_config_word(hose, bus, devfn, PCI_COMMAND, &cmd); + cmd |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER; + cmd &= ~PCI_COMMAND_IO; + early_write_config_word(hose, bus, devfn, PCI_COMMAND, cmd); + } + has_second_ohare = 1; +} + /* * Some Apple desktop machines have a NEC PD720100A USB2 controller * on the motherboard. Open Firmware, on these, will disable the @@ -688,9 +718,6 @@ static void __init fixup_nec_usb2(void) " EHCI, fixing up...\n"); data &= ~1UL; early_write_config_dword(hose, bus, devfn, 0xe4, data); - early_write_config_byte(hose, bus, - devfn | 2, PCI_INTERRUPT_LINE, - nec->intrs[0].line); } } } @@ -958,32 +985,28 @@ static int __init add_bridge(struct device_node *dev) return 0; } -static void __init pcibios_fixup_OF_interrupts(void) +void __init pmac_pcibios_fixup(void) { struct pci_dev* dev = NULL; - /* - * Open Firmware often doesn't initialize the - * PCI_INTERRUPT_LINE config register properly, so we - * should find the device node and apply the interrupt - * obtained from the OF device-tree - */ for_each_pci_dev(dev) { - struct device_node *node; - node = pci_device_to_OF_node(dev); - /* this is the node, see if it has interrupts */ - if (node && node->n_intrs > 0) - dev->irq = node->intrs[0].line; - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); + /* Read interrupt from the device-tree */ + pci_read_irq_line(dev); + + /* Fixup interrupt for the modem/ethernet combo controller + * on machines with a second ohare chip. + * The number in the device tree (27) is bogus (correct for + * the ethernet-only board but not the combo ethernet/modem + * board). The real interrupt is 28 on the second controller + * -> 28+32 = 60.
+ */ + if (has_second_ohare && + dev->vendor == PCI_VENDOR_ID_DEC && + dev->device == PCI_DEVICE_ID_DEC_TULIP_PLUS) + dev->irq = irq_create_mapping(NULL, 60, 0); } } -void __init pmac_pcibios_fixup(void) -{ - /* Fixup interrupts according to OF tree */ - pcibios_fixup_OF_interrupts(); -} - #ifdef CONFIG_PPC64 static void __init pmac_fixup_phb_resources(void) { @@ -1071,6 +1094,7 @@ void __init pmac_pci_init(void) #else /* CONFIG_PPC64 */ init_p2pbridge(); + init_second_ohare(); fixup_nec_usb2(); /* We are still having some issues with the Xserve G4, enabling diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index d6eab8b3f7de..6d66359ec8c8 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -24,19 +24,18 @@ static irqreturn_t macio_gpio_irq(int irq, void *data, struct pt_regs *regs) static int macio_do_gpio_irq_enable(struct pmf_function *func) { - if (func->node->n_intrs < 1) + unsigned int irq = irq_of_parse_and_map(func->node, 0); + if (irq == NO_IRQ) return -EINVAL; - - return request_irq(func->node->intrs[0].line, macio_gpio_irq, 0, - func->node->name, func); + return request_irq(irq, macio_gpio_irq, 0, func->node->name, func); } static int macio_do_gpio_irq_disable(struct pmf_function *func) { - if (func->node->n_intrs < 1) + unsigned int irq = irq_of_parse_and_map(func->node, 0); + if (irq == NO_IRQ) return -EINVAL; - - free_irq(func->node->intrs[0].line, func); + free_irq(irq, func); return 0; } diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 58a4c7b90b8b..3d328bc1f7e0 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -65,13 +65,11 @@ static u32 level_mask[4]; static DEFINE_SPINLOCK(pmac_pic_lock); -#define GATWICK_IRQ_POOL_SIZE 10 -static struct interrupt_info gatwick_int_pool[GATWICK_IRQ_POOL_SIZE]; - #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; static unsigned long ppc_cached_irq_mask[NR_MASK_WORDS]; static int pmac_irq_cascade = -1; +static struct irq_host *pmac_pic_host; static void __pmac_retrigger(unsigned int irq_nr) { @@ -86,18 +84,16 @@ static void __pmac_retrigger(unsigned int irq_nr) } } -static void pmac_mask_and_ack_irq(unsigned int irq_nr) +static void pmac_mask_and_ack_irq(unsigned int virq) { - unsigned long bit = 1UL << (irq_nr & 0x1f); - int i = irq_nr >> 5; + unsigned int src = irq_map[virq].hwirq; + unsigned long bit = 1UL << (src & 0x1f); + int i = src >> 5; unsigned long flags; - if ((unsigned)irq_nr >= max_irqs) - return; - spin_lock_irqsave(&pmac_pic_lock, flags); - __clear_bit(irq_nr, ppc_cached_irq_mask); - if (__test_and_clear_bit(irq_nr, ppc_lost_interrupts)) + __clear_bit(src, ppc_cached_irq_mask); + if (__test_and_clear_bit(src, ppc_lost_interrupts)) atomic_dec(&ppc_n_lost_interrupts); out_le32(&pmac_irq_hw[i]->enable, ppc_cached_irq_mask[i]); out_le32(&pmac_irq_hw[i]->ack, bit); @@ -110,17 +106,15 @@ static void pmac_mask_and_ack_irq(unsigned int irq_nr) spin_unlock_irqrestore(&pmac_pic_lock, flags); } -static void pmac_ack_irq(unsigned int irq_nr) +static void pmac_ack_irq(unsigned int virq) { - unsigned long bit = 1UL << (irq_nr & 0x1f); - int i = irq_nr >> 5; + unsigned int src = irq_map[virq].hwirq; + unsigned long bit = 1UL << (src & 0x1f); + int i = src >> 5; unsigned long flags; - if ((unsigned)irq_nr >= max_irqs) - return; - spin_lock_irqsave(&pmac_pic_lock, flags); - if
(__test_and_clear_bit(irq_nr, ppc_lost_interrupts)) + if (__test_and_clear_bit(src, ppc_lost_interrupts)) atomic_dec(&ppc_n_lost_interrupts); out_le32(&pmac_irq_hw[i]->ack, bit); (void)in_le32(&pmac_irq_hw[i]->ack); @@ -157,48 +151,51 @@ static void __pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) /* When an irq gets requested for the first client, if it's an * edge interrupt, we clear any previous one on the controller */ -static unsigned int pmac_startup_irq(unsigned int irq_nr) +static unsigned int pmac_startup_irq(unsigned int virq) { unsigned long flags; - unsigned long bit = 1UL << (irq_nr & 0x1f); - int i = irq_nr >> 5; + unsigned int src = irq_map[virq].hwirq; + unsigned long bit = 1UL << (src & 0x1f); + int i = src >> 5; spin_lock_irqsave(&pmac_pic_lock, flags); - if ((irq_desc[irq_nr].status & IRQ_LEVEL) == 0) + if ((irq_desc[virq].status & IRQ_LEVEL) == 0) out_le32(&pmac_irq_hw[i]->ack, bit); - __set_bit(irq_nr, ppc_cached_irq_mask); - __pmac_set_irq_mask(irq_nr, 0); + __set_bit(src, ppc_cached_irq_mask); + __pmac_set_irq_mask(src, 0); spin_unlock_irqrestore(&pmac_pic_lock, flags); return 0; } -static void pmac_mask_irq(unsigned int irq_nr) +static void pmac_mask_irq(unsigned int virq) { unsigned long flags; + unsigned int src = irq_map[virq].hwirq; spin_lock_irqsave(&pmac_pic_lock, flags); - __clear_bit(irq_nr, ppc_cached_irq_mask); - __pmac_set_irq_mask(irq_nr, 0); + __clear_bit(src, ppc_cached_irq_mask); + __pmac_set_irq_mask(src, 0); spin_unlock_irqrestore(&pmac_pic_lock, flags); } -static void pmac_unmask_irq(unsigned int irq_nr) +static void pmac_unmask_irq(unsigned int virq) { unsigned long flags; + unsigned int src = irq_map[virq].hwirq; spin_lock_irqsave(&pmac_pic_lock, flags); - __set_bit(irq_nr, ppc_cached_irq_mask); - __pmac_set_irq_mask(irq_nr, 0); + __set_bit(src, ppc_cached_irq_mask); + __pmac_set_irq_mask(src, 0); spin_unlock_irqrestore(&pmac_pic_lock, flags); } -static int pmac_retrigger(unsigned int irq_nr) +static int pmac_retrigger(unsigned int virq) { unsigned long flags; spin_lock_irqsave(&pmac_pic_lock, flags); - __pmac_retrigger(irq_nr); + __pmac_retrigger(irq_map[virq].hwirq); spin_unlock_irqrestore(&pmac_pic_lock, flags); return 1; } @@ -238,7 +235,7 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id, struct pt_regs *regs) return rc; } -static int pmac_get_irq(struct pt_regs *regs) +static unsigned int pmac_pic_get_irq(struct pt_regs *regs) { int irq; unsigned long bits = 0; @@ -250,7 +247,7 @@ static int pmac_get_irq(struct pt_regs *regs) /* IPI's are a hack on the powersurge -- Cort */ if ( smp_processor_id() != 0 ) { psurge_smp_message_recv(regs); - return -2; /* ignore, already handled */ + return NO_IRQ_IGNORE; /* ignore, already handled */ } #endif /* CONFIG_SMP */ spin_lock_irqsave(&pmac_pic_lock, flags); @@ -266,133 +263,9 @@ static int pmac_get_irq(struct pt_regs *regs) break; } spin_unlock_irqrestore(&pmac_pic_lock, flags); - - return irq; -} - -/* This routine will fix some missing interrupt values in the device tree - * on the gatwick mac-io controller used by some PowerBooks - * - * Walking of OF nodes could use a bit more fixing up here, but it's not - * very important as this is all boot time code on static portions of the - * device-tree. - * - * However, the modifications done to "intrs" will have to be removed and - * replaced with proper updates of the "interrupts" properties or - * AAPL,interrupts, yet to be decided, once the dynamic parsing is there. 
- */ -static void __init pmac_fix_gatwick_interrupts(struct device_node *gw, - int irq_base) -{ - struct device_node *node; - int count; - - memset(gatwick_int_pool, 0, sizeof(gatwick_int_pool)); - count = 0; - for (node = NULL; (node = of_get_next_child(gw, node)) != NULL;) { - /* Fix SCC */ - if ((strcasecmp(node->name, "escc") == 0) && node->child) { - if (node->child->n_intrs < 3) { - node->child->intrs = &gatwick_int_pool[count]; - count += 3; - } - node->child->n_intrs = 3; - node->child->intrs[0].line = 15+irq_base; - node->child->intrs[1].line = 4+irq_base; - node->child->intrs[2].line = 5+irq_base; - printk(KERN_INFO "irq: fixed SCC on gatwick" - " (%d,%d,%d)\n", - node->child->intrs[0].line, - node->child->intrs[1].line, - node->child->intrs[2].line); - } - /* Fix media-bay & left SWIM */ - if (strcasecmp(node->name, "media-bay") == 0) { - struct device_node* ya_node; - - if (node->n_intrs == 0) - node->intrs = &gatwick_int_pool[count++]; - node->n_intrs = 1; - node->intrs[0].line = 29+irq_base; - printk(KERN_INFO "irq: fixed media-bay on gatwick" - " (%d)\n", node->intrs[0].line); - - ya_node = node->child; - while(ya_node) { - if (strcasecmp(ya_node->name, "floppy") == 0) { - if (ya_node->n_intrs < 2) { - ya_node->intrs = &gatwick_int_pool[count]; - count += 2; - } - ya_node->n_intrs = 2; - ya_node->intrs[0].line = 19+irq_base; - ya_node->intrs[1].line = 1+irq_base; - printk(KERN_INFO "irq: fixed floppy on second controller (%d,%d)\n", - ya_node->intrs[0].line, ya_node->intrs[1].line); - } - if (strcasecmp(ya_node->name, "ata4") == 0) { - if (ya_node->n_intrs < 2) { - ya_node->intrs = &gatwick_int_pool[count]; - count += 2; - } - ya_node->n_intrs = 2; - ya_node->intrs[0].line = 14+irq_base; - ya_node->intrs[1].line = 3+irq_base; - printk(KERN_INFO "irq: fixed ide on second controller (%d,%d)\n", - ya_node->intrs[0].line, ya_node->intrs[1].line); - } - ya_node = ya_node->sibling; - } - } - } - if (count > 10) { - printk("WARNING !! Gatwick interrupt pool overflow\n"); - printk(" GATWICK_IRQ_POOL_SIZE = %d\n", GATWICK_IRQ_POOL_SIZE); - printk(" requested = %d\n", count); - } -} - -/* - * The PowerBook 3400/2400/3500 can have a combo ethernet/modem - * card which includes an ohare chip that acts as a second interrupt - * controller. If we find this second ohare, set it up and fix the - * interrupt value in the device tree for the ethernet chip. - */ -static void __init enable_second_ohare(struct device_node *np) -{ - unsigned char bus, devfn; - unsigned short cmd; - struct device_node *ether; - - /* This code doesn't strictly belong here, it could be part of - * either the PCI initialisation or the feature code. It's kept - * here for historical reasons. - */ - if (pci_device_from_OF_node(np, &bus, &devfn) == 0) { - struct pci_controller* hose = - pci_find_hose_for_OF_device(np); - if (!hose) { - printk(KERN_ERR "Can't find PCI hose for OHare2 !\n"); - return; - } - early_read_config_word(hose, bus, devfn, PCI_COMMAND, &cmd); - cmd |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER; - cmd &= ~PCI_COMMAND_IO; - early_write_config_word(hose, bus, devfn, PCI_COMMAND, cmd); - } - - /* Fix interrupt for the modem/ethernet combo controller. The number - * in the device tree (27) is bogus (correct for the ethernet-only - * board but not the combo ethernet/modem board). - * The real interrupt is 28 on the second controller -> 28+32 = 60. 
- */ - ether = of_find_node_by_name(NULL, "pci1011,14"); - if (ether && ether->n_intrs > 0) { - ether->intrs[0].line = 60; - printk(KERN_INFO "irq: Fixed ethernet IRQ to %d\n", - ether->intrs[0].line); - } - of_node_put(ether); + if (unlikely(irq < 0)) + return NO_IRQ; + return irq_linear_revmap(pmac_pic_host, irq); } #ifdef CONFIG_XMON @@ -411,6 +284,50 @@ static struct irqaction gatwick_cascade_action = { .name = "cascade", }; +static int pmac_pic_host_match(struct irq_host *h, struct device_node *node) +{ + /* We match all, we don't always have a node anyway */ + return 1; +} + +static int pmac_pic_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + struct irq_desc *desc = get_irq_desc(virq); + int level; + + if (hw >= max_irqs) + return -EINVAL; + + /* Mark level interrupts, set delayed disable for edge ones and set + * handlers + */ + level = !!(level_mask[hw >> 5] & (1UL << (hw & 0x1f))); + if (level) + desc->status |= IRQ_LEVEL; + else + desc->status |= IRQ_DELAYED_DISABLE; + set_irq_chip_and_handler(virq, &pmac_pic, level ? + handle_level_irq : handle_edge_irq); + return 0; +} + +static int pmac_pic_host_xlate(struct irq_host *h, struct device_node *ct, + u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, + unsigned int *out_flags) + +{ + *out_hwirq = *intspec; + return 0; +} + +static struct irq_host_ops pmac_pic_host_ops = { + .match = pmac_pic_host_match, + .map = pmac_pic_host_map, + .xlate = pmac_pic_host_xlate, +}; + static void __init pmac_pic_probe_oldstyle(void) { int i; @@ -420,7 +337,7 @@ static void __init pmac_pic_probe_oldstyle(void) struct resource r; /* Set our get_irq function */ - ppc_md.get_irq = pmac_get_irq; + ppc_md.get_irq = pmac_pic_get_irq; /* * Find the interrupt controller type & node @@ -438,7 +355,6 @@ static void __init pmac_pic_probe_oldstyle(void) if (slave) { max_irqs = 64; level_mask[1] = OHARE_LEVEL_MASK; - enable_second_ohare(slave); } } else if ((master = of_find_node_by_name(NULL, "mac-io")) != NULL) { max_irqs = max_real_irqs = 64; @@ -462,21 +378,18 @@ static void __init pmac_pic_probe_oldstyle(void) max_irqs = 128; level_mask[2] = HEATHROW_LEVEL_MASK; level_mask[3] = 0; - pmac_fix_gatwick_interrupts(slave, max_real_irqs); } } BUG_ON(master == NULL); - /* Mark level interrupts and set handlers */ - for (i = 0; i < max_irqs; i++) { - int level = !!(level_mask[i >> 5] & (1UL << (i & 0x1f))); - if (level) - irq_desc[i].status |= IRQ_LEVEL; - else - irq_desc[i].status |= IRQ_DELAYED_DISABLE; - set_irq_chip_and_handler(i, &pmac_pic, level ? 
- handle_level_irq : handle_edge_irq); - } + /* + * Allocate an irq host + */ + pmac_pic_host = irq_alloc_host(IRQ_HOST_MAP_LINEAR, max_irqs, + &pmac_pic_host_ops, + max_irqs); + BUG_ON(pmac_pic_host == NULL); + irq_set_default_host(pmac_pic_host); /* Get addresses of first controller if we have a node for it */ BUG_ON(of_address_to_resource(master, 0, &r)); @@ -503,7 +416,7 @@ static void __init pmac_pic_probe_oldstyle(void) pmac_irq_hw[i++] = (volatile struct pmac_irq_hw __iomem *) (addr + 0x10); - pmac_irq_cascade = slave->intrs[0].line; + pmac_irq_cascade = irq_of_parse_and_map(slave, 0); printk(KERN_INFO "irq: Found slave Apple PIC %s for %d irqs" " cascade: %d\n", slave->full_name, @@ -516,12 +429,12 @@ static void __init pmac_pic_probe_oldstyle(void) out_le32(&pmac_irq_hw[i]->enable, 0); /* Hookup cascade irq */ - if (slave) + if (slave && pmac_irq_cascade != NO_IRQ) setup_irq(pmac_irq_cascade, &gatwick_cascade_action); printk(KERN_INFO "irq: System has %d possible interrupts\n", max_irqs); #ifdef CONFIG_XMON - setup_irq(20, &xmon_action); + setup_irq(irq_create_mapping(NULL, 20, 0), &xmon_action); #endif } #endif /* CONFIG_PPC32 */ @@ -530,16 +443,11 @@ static void pmac_u3_cascade(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) { struct mpic *mpic = desc->handler_data; - unsigned int max = 100; - while(max--) { - int cascade_irq = mpic_get_one_irq(mpic, regs); - if (max == 99) - desc->chip->eoi(irq); - if (irq < 0) - break; + unsigned int cascade_irq = mpic_get_one_irq(mpic, regs); + if (cascade_irq != NO_IRQ) generic_handle_irq(cascade_irq, regs); - }; + desc->chip->eoi(irq); } static void __init pmac_pic_setup_mpic_nmi(struct mpic *mpic) @@ -549,21 +457,20 @@ static void __init pmac_pic_setup_mpic_nmi(struct mpic *mpic) int nmi_irq; pswitch = of_find_node_by_name(NULL, "programmer-switch"); - if (pswitch && pswitch->n_intrs) { - nmi_irq = pswitch->intrs[0].line; - mpic_irq_set_priority(nmi_irq, 9); - setup_irq(nmi_irq, &xmon_action); + if (pswitch) { + nmi_irq = irq_of_parse_and_map(pswitch, 0); + if (nmi_irq != NO_IRQ) { + mpic_irq_set_priority(nmi_irq, 9); + setup_irq(nmi_irq, &xmon_action); + } + of_node_put(pswitch); } - of_node_put(pswitch); #endif /* defined(CONFIG_XMON) && defined(CONFIG_PPC32) */ } static struct mpic * __init pmac_setup_one_mpic(struct device_node *np, int master) { - unsigned char senses[128]; - int offset = master ? 0 : 128; - int count = master ? 128 : 124; const char *name = master ? " MPIC 1 " : " MPIC 2 "; struct resource r; struct mpic *mpic; @@ -576,8 +483,6 @@ static struct mpic * __init pmac_setup_one_mpic(struct device_node *np, pmac_call_feature(PMAC_FTR_ENABLE_MPIC, np, 0, 0); - prom_get_irq_senses(senses, offset, offset + count); - flags |= MPIC_WANTS_RESET; if (get_property(np, "big-endian", NULL)) flags |= MPIC_BIG_ENDIAN; @@ -588,8 +493,7 @@ static struct mpic * __init pmac_setup_one_mpic(struct device_node *np, if (master && (flags & MPIC_BIG_ENDIAN)) flags |= MPIC_BROKEN_U3; - mpic = mpic_alloc(r.start, flags, 0, offset, count, master ? 
252 : 0, - senses, count, name); + mpic = mpic_alloc(np, r.start, flags, 0, 0, name); if (mpic == NULL) return NULL; @@ -602,6 +506,7 @@ static int __init pmac_pic_probe_mpic(void) { struct mpic *mpic1, *mpic2; struct device_node *np, *master = NULL, *slave = NULL; + unsigned int cascade; /* We can have up to 2 MPICs cascaded */ for (np = NULL; (np = of_find_node_by_type(np, "open-pic")) @@ -638,8 +543,15 @@ static int __init pmac_pic_probe_mpic(void) of_node_put(master); /* No slave, let's go out */ - if (slave == NULL || slave->n_intrs < 1) + if (slave == NULL) + return 0; + + /* Get/Map slave interrupt */ + cascade = irq_of_parse_and_map(slave, 0); + if (cascade == NO_IRQ) { + printk(KERN_ERR "Failed to map cascade IRQ\n"); return 0; + } mpic2 = pmac_setup_one_mpic(slave, 0); if (mpic2 == NULL) { @@ -647,8 +559,8 @@ static int __init pmac_pic_probe_mpic(void) of_node_put(slave); return 0; } - set_irq_data(slave->intrs[0].line, mpic2); - set_irq_chained_handler(slave->intrs[0].line, pmac_u3_cascade); + set_irq_data(cascade, mpic2); + set_irq_chained_handler(cascade, pmac_u3_cascade); of_node_put(slave); return 0; @@ -657,6 +569,19 @@ static int __init pmac_pic_probe_mpic(void) void __init pmac_pic_init(void) { + unsigned int flags = 0; + + /* We configure the OF parsing based on our oldworld vs. newworld + * platform type and whether we were booted by BootX. + */ +#ifdef CONFIG_PPC32 + if (!pmac_newworld) + flags |= OF_IMAP_OLDWORLD_MAC; + if (get_property(of_chosen, "linux,bootx", NULL) != NULL) + flags |= OF_IMAP_NO_PHANDLE; + of_irq_map_init(flags); +#endif /* CONFIG_PPC32 */ + /* We first try to detect Apple's new Core99 chipset, since mac-io * is quite different on those machines and contains an IBM MPIC2. */ @@ -679,6 +604,7 @@ unsigned long sleep_save_mask[2]; /* This used to be passed by the PMU driver but that link got * broken with the new driver model. We use this tweak for now... + * We really want to do things differently though...
 */ static int pmacpic_find_viaint(void) { @@ -692,7 +618,7 @@ static int pmacpic_find_viaint(void) np = of_find_node_by_name(NULL, "via-pmu"); if (np == NULL) goto not_found; - viaint = np->intrs[0].line; + viaint = irq_of_parse_and_map(np, 0); #endif /* CONFIG_ADB_PMU */ not_found: diff --git a/arch/powerpc/platforms/powermac/pmac.h b/arch/powerpc/platforms/powermac/pmac.h index 21c7b0f8f329..94e7b24b840b 100644 --- a/arch/powerpc/platforms/powermac/pmac.h +++ b/arch/powerpc/platforms/powermac/pmac.h @@ -12,6 +12,8 @@ struct rtc_time; +extern int pmac_newworld; + extern long pmac_time_init(void); extern unsigned long pmac_get_boot_time(void); extern void pmac_get_rtc_time(struct rtc_time *); diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 8654b5f07836..31a9da769fa2 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -613,9 +613,6 @@ static void __init pmac_init_early(void) udbg_adb_init(!!strstr(cmd_line, "btextdbg")); #ifdef CONFIG_PPC64 - /* Setup interrupt mapping options */ - ppc64_interrupt_controller = IC_OPEN_PIC; - iommu_init_early_dart(); #endif } diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 9639c66b453d..9df783088b61 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -72,32 +72,62 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id, /* #define DEBUG */ -static void request_ras_irqs(struct device_node *np, char *propname, + +static void request_ras_irqs(struct device_node *np, irqreturn_t (*handler)(int, void *, struct pt_regs *), const char *name) { - unsigned int *ireg, len, i; - int virq, n_intr; - - ireg = (unsigned int *)get_property(np, propname, &len); - if (ireg == NULL) - return; - n_intr = prom_n_intr_cells(np); - len /= n_intr * sizeof(*ireg); - - for (i = 0; i < len; i++) { - virq = virt_irq_create_mapping(*ireg); - if (virq == NO_IRQ) { - printk(KERN_ERR "Unable to allocate interrupt " - "number for %s\n", np->full_name); - return; + int i, index, count = 0; + struct of_irq oirq; + u32 *opicprop; + unsigned int opicplen; + unsigned int virqs[16]; + + /* Check for obsolete "open-pic-interrupt" property.
If present, then + * map those interrupts using the default interrupt host and default + * trigger + */ + opicprop = (u32 *)get_property(np, "open-pic-interrupt", &opicplen); + if (opicprop) { + opicplen /= sizeof(u32); + for (i = 0; i < opicplen; i++) { + if (count > 15) + break; + virqs[count] = irq_create_mapping(NULL, *(opicprop++), + IRQ_TYPE_NONE); + if (virqs[count] == NO_IRQ) + printk(KERN_ERR "Unable to allocate interrupt " + "number for %s\n", np->full_name); + else + count++; + } - if (request_irq(irq_offset_up(virq), handler, 0, name, NULL)) { + } + /* Else use normal interrupt tree parsing */ + else { + /* First try to do a proper OF tree parsing */ + for (index = 0; of_irq_map_one(np, index, &oirq) == 0; + index++) { + if (count > 15) + break; + virqs[count] = irq_create_of_mapping(oirq.controller, + oirq.specifier, + oirq.size); + if (virqs[count] == NO_IRQ) + printk(KERN_ERR "Unable to allocate interrupt " + "number for %s\n", np->full_name); + else + count++; + } + } + + /* Now request them */ + for (i = 0; i < count; i++) { + if (request_irq(virqs[i], handler, 0, name, NULL)) { printk(KERN_ERR "Unable to request interrupt %d for " - "%s\n", irq_offset_up(virq), np->full_name); + "%s\n", virqs[i], np->full_name); return; } - ireg += n_intr; } } @@ -115,20 +145,14 @@ static int __init init_ras_IRQ(void) /* Internal Errors */ np = of_find_node_by_path("/event-sources/internal-errors"); if (np != NULL) { - request_ras_irqs(np, "open-pic-interrupt", ras_error_interrupt, - "RAS_ERROR"); - request_ras_irqs(np, "interrupts", ras_error_interrupt, - "RAS_ERROR"); + request_ras_irqs(np, ras_error_interrupt, "RAS_ERROR"); of_node_put(np); } /* EPOW Events */ np = of_find_node_by_path("/event-sources/epow-events"); if (np != NULL) { - request_ras_irqs(np, "open-pic-interrupt", ras_epow_interrupt, - "RAS_EPOW"); - request_ras_irqs(np, "interrupts", ras_epow_interrupt, - "RAS_EPOW"); + request_ras_irqs(np, ras_epow_interrupt, "RAS_EPOW"); of_node_put(np); } @@ -162,7 +186,7 @@ ras_epow_interrupt(int irq, void *dev_id, struct pt_regs * regs) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RAS_VECTOR_OFFSET, - virt_irq_to_real(irq_offset_down(irq)), + irq_map[irq].hwirq, RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS, critical, __pa(&ras_log_buf), rtas_get_error_log_max()); @@ -198,7 +222,7 @@ ras_error_interrupt(int irq, void *dev_id, struct pt_regs * regs) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RAS_VECTOR_OFFSET, - virt_irq_to_real(irq_offset_down(irq)), + irq_map[irq].hwirq, RTAS_INTERNAL_ERROR, 1 /*Time Critical */, __pa(&ras_log_buf), rtas_get_error_log_max()); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 476b564a208b..54a52437265c 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -76,6 +76,9 @@ #define DBG(fmt...) 
#endif +/* move those away to a .h */ +extern void smp_init_pseries_mpic(void); +extern void smp_init_pseries_xics(void); extern void find_udbg_vterm(void); int fwnmi_active; /* TRUE if an FWNMI handler is present */ @@ -83,7 +86,7 @@ int fwnmi_active; /* TRUE if an FWNMI handler is present */ static void pseries_shared_idle_sleep(void); static void pseries_dedicated_idle_sleep(void); -struct mpic *pSeries_mpic; +static struct device_node *pSeries_mpic_node; static void pSeries_show_cpuinfo(struct seq_file *m) { @@ -118,78 +121,92 @@ static void __init fwnmi_init(void) fwnmi_active = 1; } -void pSeries_8259_cascade(unsigned int irq, struct irq_desc *desc, +void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) { - unsigned int max = 100; - - while(max--) { - int cascade_irq = i8259_irq(regs); - if (max == 99) - desc->chip->eoi(irq); - if (cascade_irq < 0) - break; + unsigned int cascade_irq = i8259_irq(regs); + if (cascade_irq != NO_IRQ) generic_handle_irq(cascade_irq, regs); - }; + desc->chip->eoi(irq); } -static void __init pSeries_init_mpic(void) +static void __init pseries_mpic_init_IRQ(void) { + struct device_node *np, *old, *cascade = NULL; unsigned int *addrp; - struct device_node *np; unsigned long intack = 0; - - /* All ISUs are setup, complete initialization */ - mpic_init(pSeries_mpic); - - /* Check what kind of cascade ACK we have */ - if (!(np = of_find_node_by_name(NULL, "pci")) - || !(addrp = (unsigned int *) - get_property(np, "8259-interrupt-acknowledge", NULL))) - printk(KERN_ERR "Cannot find pci to get ack address\n"); - else - intack = addrp[prom_n_addr_cells(np)-1]; - of_node_put(np); - - /* Setup the legacy interrupts & controller */ - i8259_init(intack, 0); - - /* Hook cascade to mpic */ - set_irq_chained_handler(NUM_ISA_INTERRUPTS, pSeries_8259_cascade); -} - -static void __init pSeries_setup_mpic(void) -{ unsigned int *opprop; unsigned long openpic_addr = 0; - unsigned char senses[NR_IRQS - NUM_ISA_INTERRUPTS]; - struct device_node *root; - int irq_count; + unsigned int cascade_irq; + int naddr, n, i, opplen; + struct mpic *mpic; - /* Find the Open PIC if present */ - root = of_find_node_by_path("/"); - opprop = (unsigned int *) get_property(root, "platform-open-pic", NULL); + np = of_find_node_by_path("/"); + naddr = prom_n_addr_cells(np); + opprop = (unsigned int *) get_property(np, "platform-open-pic", &opplen); if (opprop != 0) { - int n = prom_n_addr_cells(root); - - for (openpic_addr = 0; n > 0; --n) - openpic_addr = (openpic_addr << 32) + *opprop++; + openpic_addr = of_read_number(opprop, naddr); printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr); } - of_node_put(root); + of_node_put(np); BUG_ON(openpic_addr == 0); - /* Get the sense values from OF */ - prom_get_irq_senses(senses, NUM_ISA_INTERRUPTS, NR_IRQS); - /* Setup the openpic driver */ - irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */ - pSeries_mpic = mpic_alloc(openpic_addr, MPIC_PRIMARY, - 16, 16, irq_count, /* isu size, irq offset, irq count */ - NR_IRQS - 4, /* ipi offset */ - senses, irq_count, /* sense & sense size */ - " MPIC "); + mpic = mpic_alloc(pSeries_mpic_node, openpic_addr, + MPIC_PRIMARY, + 16, 250, /* isu size, irq count */ + " MPIC "); + BUG_ON(mpic == NULL); + + /* Add ISUs */ + opplen /= sizeof(u32); + for (n = 0, i = naddr; i < opplen; i += naddr, n++) { + unsigned long isuaddr = of_read_number(opprop + i, naddr); + mpic_assign_isu(mpic, n, isuaddr); + } + + /* All ISUs are setup, complete initialization */ + 
mpic_init(mpic); + + /* Look for cascade */ + for_each_node_by_type(np, "interrupt-controller") + if (device_is_compatible(np, "chrp,iic")) { + cascade = np; + break; + } + if (cascade == NULL) + return; + + cascade_irq = irq_of_parse_and_map(cascade, 0); + if (cascade_irq == NO_IRQ) { + printk(KERN_ERR "mpic: failed to map cascade interrupt\n"); + return; + } + + /* Check ACK type */ + for (old = of_node_get(cascade); old != NULL ; old = np) { + np = of_get_parent(old); + of_node_put(old); + if (np == NULL) + break; + if (strcmp(np->name, "pci") != 0) + continue; + addrp = (u32 *)get_property(np, "8259-interrupt-acknowledge", + NULL); + if (addrp == NULL) + continue; + naddr = prom_n_addr_cells(np); + intack = addrp[naddr-1]; + if (naddr > 1) + intack |= ((unsigned long)addrp[naddr-2]) << 32; + } + if (intack) + printk(KERN_DEBUG "mpic: PCI 8259 intack at 0x%016lx\n", + intack); + i8259_init(cascade, intack); + of_node_put(cascade); + set_irq_chained_handler(cascade_irq, pseries_8259_cascade); } static void pseries_lpar_enable_pmcs(void) @@ -207,21 +224,67 @@ static void pseries_lpar_enable_pmcs(void) get_lppaca()->pmcregs_in_use = 1; } -static void __init pSeries_setup_arch(void) +#ifdef CONFIG_KEXEC +static void pseries_kexec_cpu_down_mpic(int crash_shutdown, int secondary) { - /* Fixup ppc_md depending on the type of interrupt controller */ - if (ppc64_interrupt_controller == IC_OPEN_PIC) { - ppc_md.init_IRQ = pSeries_init_mpic; - ppc_md.get_irq = mpic_get_irq; - /* Allocate the mpic now, so that find_and_init_phbs() can - * fill the ISUs */ - pSeries_setup_mpic(); - } else - ppc_md.init_IRQ = xics_init_IRQ; + mpic_teardown_this_cpu(secondary); +} +static void pseries_kexec_cpu_down_xics(int crash_shutdown, int secondary) +{ + /* Don't risk a hypervisor call if we're crashing */ + if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { + unsigned long vpa = __pa(get_lppaca()); + + if (unregister_vpa(hard_smp_processor_id(), vpa)) { + printk("VPA deregistration of cpu %u (hw_cpu_id %d) " + "failed\n", smp_processor_id(), + hard_smp_processor_id()); + } + } + xics_teardown_cpu(secondary); +} +#endif /* CONFIG_KEXEC */ + +static void __init pseries_discover_pic(void) +{ + struct device_node *np; + char *typep; + + for (np = NULL; (np = of_find_node_by_name(np, + "interrupt-controller"));) { + typep = (char *)get_property(np, "compatible", NULL); + if (strstr(typep, "open-pic")) { + pSeries_mpic_node = of_node_get(np); + ppc_md.init_IRQ = pseries_mpic_init_IRQ; + ppc_md.get_irq = mpic_get_irq; +#ifdef CONFIG_KEXEC + ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_mpic; +#endif #ifdef CONFIG_SMP - smp_init_pSeries(); + smp_init_pseries_mpic(); #endif + return; + } else if (strstr(typep, "ppc-xicp")) { + ppc_md.init_IRQ = xics_init_IRQ; +#ifdef CONFIG_KEXEC + ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics; +#endif +#ifdef CONFIG_SMP + smp_init_pseries_xics(); +#endif + return; + } + } + printk(KERN_ERR "pseries_discover_pic: failed to recognize" + " interrupt-controller\n"); +} + +static void __init pSeries_setup_arch(void) +{ + /* Discover PIC type and setup ppc_md accordingly */ + pseries_discover_pic(); + /* openpic global configuration register (64-bit format). */ /* openpic Interrupt Source Unit pointer (64-bit format). */ /* python0 facility area (mmio) (64-bit format) REAL address.
*/ @@ -273,33 +336,6 @@ static int __init pSeries_init_panel(void) } arch_initcall(pSeries_init_panel); -static void __init pSeries_discover_pic(void) -{ - struct device_node *np; - char *typep; - - /* - * Setup interrupt mapping options that are needed for finish_device_tree - * to properly parse the OF interrupt tree & do the virtual irq mapping - */ - __irq_offset_value = NUM_ISA_INTERRUPTS; - ppc64_interrupt_controller = IC_INVALID; - for (np = NULL; (np = of_find_node_by_name(np, "interrupt-controller"));) { - typep = (char *)get_property(np, "compatible", NULL); - if (strstr(typep, "open-pic")) { - ppc64_interrupt_controller = IC_OPEN_PIC; - break; - } else if (strstr(typep, "ppc-xicp")) { - ppc64_interrupt_controller = IC_PPC_XIC; - break; - } - } - if (ppc64_interrupt_controller == IC_INVALID) - printk("pSeries_discover_pic: failed to recognize" - " interrupt-controller\n"); - -} - static void pSeries_mach_cpu_die(void) { local_irq_disable(); @@ -342,8 +378,6 @@ static void __init pSeries_init_early(void) iommu_init_early_pSeries(); - pSeries_discover_pic(); - DBG(" <- pSeries_init_early()\n"); } @@ -515,27 +549,6 @@ static int pSeries_pci_probe_mode(struct pci_bus *bus) return PCI_PROBE_NORMAL; } -#ifdef CONFIG_KEXEC -static void pseries_kexec_cpu_down(int crash_shutdown, int secondary) -{ - /* Don't risk a hypervisor call if we're crashing */ - if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { - unsigned long vpa = __pa(get_lppaca()); - - if (unregister_vpa(hard_smp_processor_id(), vpa)) { - printk("VPA deregistration of cpu %u (hw_cpu_id %d) " - "failed\n", smp_processor_id(), - hard_smp_processor_id()); - } - } - - if (ppc64_interrupt_controller == IC_OPEN_PIC) - mpic_teardown_this_cpu(secondary); - else - xics_teardown_cpu(secondary); -} -#endif - define_machine(pseries) { .name = "pSeries", .probe = pSeries_probe, @@ -560,7 +573,6 @@ define_machine(pseries) { .system_reset_exception = pSeries_system_reset_exception, .machine_check_exception = pSeries_machine_check_exception, #ifdef CONFIG_KEXEC - .kexec_cpu_down = pseries_kexec_cpu_down, .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, .machine_crash_shutdown = default_machine_crash_shutdown, diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 4ad144df49c2..ac61098ff401 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -416,27 +416,12 @@ static struct smp_ops_t pSeries_xics_smp_ops = { #endif /* This is called very early */ -void __init smp_init_pSeries(void) +static void __init smp_init_pseries(void) { int i; DBG(" -> smp_init_pSeries()\n"); - switch (ppc64_interrupt_controller) { -#ifdef CONFIG_MPIC - case IC_OPEN_PIC: - smp_ops = &pSeries_mpic_smp_ops; - break; -#endif -#ifdef CONFIG_XICS - case IC_PPC_XIC: - smp_ops = &pSeries_xics_smp_ops; - break; -#endif - default: - panic("Invalid interrupt controller"); - } - #ifdef CONFIG_HOTPLUG_CPU smp_ops->cpu_disable = pSeries_cpu_disable; smp_ops->cpu_die = pSeries_cpu_die; @@ -471,3 +456,18 @@ void __init smp_init_pSeries(void) DBG(" <- smp_init_pSeries()\n"); } +#ifdef CONFIG_MPIC +void __init smp_init_pseries_mpic(void) +{ + smp_ops = &pSeries_mpic_smp_ops; + + smp_init_pseries(); +} +#endif + +void __init smp_init_pseries_xics(void) +{ + smp_ops = &pSeries_xics_smp_ops; + + smp_init_pseries(); +} diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index c7f04420066d..716972aa9777 100644 
--- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -8,6 +8,9 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#undef DEBUG + #include #include #include @@ -19,6 +22,7 @@ #include #include #include + #include #include #include @@ -31,9 +35,6 @@ #include "xics.h" -/* This is used to map real irq numbers to virtual */ -static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_ATOMIC); - #define XICS_IPI 2 #define XICS_IRQ_SPURIOUS 0 @@ -64,12 +65,12 @@ struct xics_ipl { static struct xics_ipl __iomem *xics_per_cpu[NR_CPUS]; -static int xics_irq_8259_cascade = 0; -static int xics_irq_8259_cascade_real = 0; static unsigned int default_server = 0xFF; static unsigned int default_distrib_server = 0; static unsigned int interrupt_server_size = 8; +static struct irq_host *xics_host; + /* * XICS only has a single IPI, so encode the messages per CPU */ @@ -85,7 +86,7 @@ static int ibm_int_off; /* Direct HW low level accessors */ -static inline int direct_xirr_info_get(int n_cpu) +static inline unsigned int direct_xirr_info_get(int n_cpu) { return in_be32(&xics_per_cpu[n_cpu]->xirr.word); } @@ -130,7 +131,7 @@ static inline long plpar_xirr(unsigned long *xirr_ret) return plpar_hcall(H_XIRR, 0, 0, 0, 0, xirr_ret, &dummy, &dummy); } -static inline int lpar_xirr_info_get(int n_cpu) +static inline unsigned int lpar_xirr_info_get(int n_cpu) { unsigned long lpar_rc; unsigned long return_value; @@ -138,7 +139,7 @@ static inline int lpar_xirr_info_get(int n_cpu) lpar_rc = plpar_xirr(&return_value); if (lpar_rc != H_SUCCESS) panic(" bad return code xirr - rc = %lx \n", lpar_rc); - return (int)return_value; + return (unsigned int)return_value; } static inline void lpar_xirr_info_set(int n_cpu, int value) @@ -175,11 +176,11 @@ static inline void lpar_qirr_info(int n_cpu , u8 value) #ifdef CONFIG_SMP -static int get_irq_server(unsigned int irq) +static int get_irq_server(unsigned int virq) { unsigned int server; /* For the moment only implement delivery to all cpus or one cpu */ - cpumask_t cpumask = irq_desc[irq].affinity; + cpumask_t cpumask = irq_desc[virq].affinity; cpumask_t tmp = CPU_MASK_NONE; if (!distribute_irqs) @@ -200,7 +201,7 @@ static int get_irq_server(unsigned int irq) } #else -static int get_irq_server(unsigned int irq) +static int get_irq_server(unsigned int virq) { return default_server; } @@ -213,9 +214,11 @@ static void xics_unmask_irq(unsigned int virq) int call_status; unsigned int server; - irq = virt_irq_to_real(irq_offset_down(virq)); - WARN_ON(irq == NO_IRQ); - if (irq == XICS_IPI || irq == NO_IRQ) + pr_debug("xics: unmask virq %d\n", virq); + + irq = (unsigned int)irq_map[virq].hwirq; + pr_debug(" -> map to hwirq 0x%x\n", irq); + if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) return; server = get_irq_server(virq); @@ -267,75 +270,57 @@ static void xics_mask_irq(unsigned int virq) { unsigned int irq; - irq = virt_irq_to_real(irq_offset_down(virq)); - WARN_ON(irq == NO_IRQ); - if (irq != NO_IRQ) - xics_mask_real_irq(irq); + pr_debug("xics: mask virq %d\n", virq); + + irq = (unsigned int)irq_map[virq].hwirq; + if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) + return; + xics_mask_real_irq(irq); } -static void xics_set_irq_revmap(unsigned int virq) +static unsigned int xics_startup(unsigned int virq) { unsigned int irq; - irq = irq_offset_down(virq); - if (radix_tree_insert(&irq_map, virt_irq_to_real(irq), - &virt_irq_to_real_map[irq]) == -ENOMEM) - printk(KERN_CRIT 
"Out of memory creating real -> virtual" - " IRQ mapping for irq %u (real 0x%x)\n", - virq, virt_irq_to_real(irq)); -} + /* force a reverse mapping of the interrupt so it gets in the cache */ + irq = (unsigned int)irq_map[virq].hwirq; + irq_radix_revmap(xics_host, irq); -static unsigned int xics_startup(unsigned int virq) -{ - xics_set_irq_revmap(virq); + /* unmask it */ xics_unmask_irq(virq); return 0; } -static unsigned int real_irq_to_virt(unsigned int real_irq) -{ - unsigned int *ptr; - - ptr = radix_tree_lookup(&irq_map, real_irq); - if (ptr == NULL) - return NO_IRQ; - return ptr - virt_irq_to_real_map; -} - -static void xics_eoi_direct(unsigned int irq) +static void xics_eoi_direct(unsigned int virq) { int cpu = smp_processor_id(); + unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - direct_xirr_info_set(cpu, ((0xff << 24) | - (virt_irq_to_real(irq_offset_down(irq))))); + direct_xirr_info_set(cpu, (0xff << 24) | irq); } -static void xics_eoi_lpar(unsigned int irq) +static void xics_eoi_lpar(unsigned int virq) { int cpu = smp_processor_id(); + unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - lpar_xirr_info_set(cpu, ((0xff << 24) | - (virt_irq_to_real(irq_offset_down(irq))))); - + lpar_xirr_info_set(cpu, (0xff << 24) | irq); } -static inline int xics_remap_irq(int vec) +static inline unsigned int xics_remap_irq(unsigned int vec) { - int irq; + unsigned int irq; vec &= 0x00ffffff; if (vec == XICS_IRQ_SPURIOUS) return NO_IRQ; - - irq = real_irq_to_virt(vec); - if (irq == NO_IRQ) - irq = real_irq_to_virt_slowpath(vec); + irq = irq_radix_revmap(xics_host, vec); if (likely(irq != NO_IRQ)) - return irq_offset_up(irq); + return irq; printk(KERN_ERR "Interrupt %u (real) is invalid," " disabling it.\n", vec); @@ -343,14 +328,14 @@ static inline int xics_remap_irq(int vec) return NO_IRQ; } -static int xics_get_irq_direct(struct pt_regs *regs) +static unsigned int xics_get_irq_direct(struct pt_regs *regs) { unsigned int cpu = smp_processor_id(); return xics_remap_irq(direct_xirr_info_get(cpu)); } -static int xics_get_irq_lpar(struct pt_regs *regs) +static unsigned int xics_get_irq_lpar(struct pt_regs *regs) { unsigned int cpu = smp_processor_id(); @@ -437,8 +422,8 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) unsigned long newmask; cpumask_t tmp = CPU_MASK_NONE; - irq = virt_irq_to_real(irq_offset_down(virq)); - if (irq == XICS_IPI || irq == NO_IRQ) + irq = (unsigned int)irq_map[virq].hwirq; + if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) return; status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); @@ -469,6 +454,24 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) } } +void xics_setup_cpu(void) +{ + int cpu = smp_processor_id(); + + xics_set_cpu_priority(cpu, 0xff); + + /* + * Put the calling processor into the GIQ. This is really only + * necessary from a secondary thread as the OF start-cpu interface + * performs this function for us on primary threads. 
+ * + * XXX: undo of teardown on kexec needs this too, as may hotplug + */ + rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE, + (1UL << interrupt_server_size) - 1 - default_distrib_server, 1); +} + + static struct irq_chip xics_pic_direct = { .typename = " XICS ", .startup = xics_startup, @@ -489,90 +492,245 @@ static struct irq_chip xics_pic_lpar = { }; -void xics_setup_cpu(void) +static int xics_host_match(struct irq_host *h, struct device_node *node) { - int cpu = smp_processor_id(); + /* IBM machines have interrupt parents of various funky types for things + * like vdevices, events, etc... The trick we use here is to match + * everything here except the legacy 8259 which is compatible "chrp,iic" + */ + return !device_is_compatible(node, "chrp,iic"); +} - xics_set_cpu_priority(cpu, 0xff); +static int xics_host_map_direct(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + unsigned int sense = flags & IRQ_TYPE_SENSE_MASK; - /* - * Put the calling processor into the GIQ. This is really only - * necessary from a secondary thread as the OF start-cpu interface - * performs this function for us on primary threads. - * - * XXX: undo of teardown on kexec needs this too, as may hotplug + pr_debug("xics: map_direct virq %d, hwirq 0x%lx, flags: 0x%x\n", + virq, hw, flags); + + if (sense && sense != IRQ_TYPE_LEVEL_LOW) + printk(KERN_WARNING "xics: using unsupported sense 0x%x" + " for irq %d (h: 0x%lx)\n", flags, virq, hw); + + get_irq_desc(virq)->status |= IRQ_LEVEL; + set_irq_chip_and_handler(virq, &xics_pic_direct, handle_fasteoi_irq); + return 0; +} + +static int xics_host_map_lpar(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + unsigned int sense = flags & IRQ_TYPE_SENSE_MASK; + + pr_debug("xics: map_lpar virq %d, hwirq 0x%lx, flags: 0x%x\n", + virq, hw, flags); + + if (sense && sense != IRQ_TYPE_LEVEL_LOW) + printk(KERN_WARNING "xics: using unsupported sense 0x%x" + " for irq %d (h: 0x%lx)\n", flags, virq, hw); + + get_irq_desc(virq)->status |= IRQ_LEVEL; + set_irq_chip_and_handler(virq, &xics_pic_lpar, handle_fasteoi_irq); + return 0; +} + +static int xics_host_xlate(struct irq_host *h, struct device_node *ct, + u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + /* Current xics implementation translates everything + * to level. It is not technically right for MSIs but this + * is irrelevant at this point. 
We might get smarter in the future */ - rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE, - (1UL << interrupt_server_size) - 1 - default_distrib_server, 1); + *out_hwirq = intspec[0]; + *out_flags = IRQ_TYPE_LEVEL_LOW; + + return 0; +} + +static struct irq_host_ops xics_host_direct_ops = { + .match = xics_host_match, + .map = xics_host_map_direct, + .xlate = xics_host_xlate, +}; + +static struct irq_host_ops xics_host_lpar_ops = { + .match = xics_host_match, + .map = xics_host_map_lpar, + .xlate = xics_host_xlate, +}; + +static void __init xics_init_host(void) +{ + struct irq_host_ops *ops; + + if (firmware_has_feature(FW_FEATURE_LPAR)) + ops = &xics_host_lpar_ops; + else + ops = &xics_host_direct_ops; + xics_host = irq_alloc_host(IRQ_HOST_MAP_TREE, 0, ops, + XICS_IRQ_SPURIOUS); + BUG_ON(xics_host == NULL); + irq_set_default_host(xics_host); } -void xics_init_IRQ(void) +static void __init xics_map_one_cpu(int hw_id, unsigned long addr, + unsigned long size) { +#ifdef CONFIG_SMP int i; - unsigned long intr_size = 0; - struct device_node *np; - uint *ireg, ilen, indx = 0; - unsigned long intr_base = 0; - struct xics_interrupt_node { - unsigned long addr; - unsigned long size; - } intnodes[NR_CPUS]; - struct irq_chip *chip; - ppc64_boot_msg(0x20, "XICS Init"); + /* This may look gross but it's good enough for now, we don't quite + * have a hard -> linux processor id matching. + */ + for_each_possible_cpu(i) { + if (!cpu_present(i)) + continue; + if (hw_id == get_hard_smp_processor_id(i)) { + xics_per_cpu[i] = ioremap(addr, size); + return; + } + } +#else + if (hw_id != 0) + return; + xics_per_cpu[0] = ioremap(addr, size); +#endif /* CONFIG_SMP */ +} - ibm_get_xive = rtas_token("ibm,get-xive"); - ibm_set_xive = rtas_token("ibm,set-xive"); - ibm_int_on = rtas_token("ibm,int-on"); - ibm_int_off = rtas_token("ibm,int-off"); +static void __init xics_init_one_node(struct device_node *np, + unsigned int *indx) +{ + unsigned int ilen; + u32 *ireg; - np = of_find_node_by_type(NULL, "PowerPC-External-Interrupt-Presentation"); - if (!np) - panic("xics_init_IRQ: can't find interrupt presentation"); + /* This code makes the theoretically broken assumption that the interrupt + * server numbers are the same as the hard CPU numbers. + * This happens to be the case so far but we are playing with fire... + * should be fixed one of these days. -BenH. + */ + ireg = (u32 *)get_property(np, "ibm,interrupt-server-ranges", NULL); -nextnode: - ireg = (uint *)get_property(np, "ibm,interrupt-server-ranges", NULL); + /* Does that ever happen? We'll know soon enough... but even good old + * f80 does have that property. + */ + WARN_ON(ireg == NULL); if (ireg) { /* * set node starting index for this node */ - indx = *ireg; + *indx = *ireg; } - - ireg = (uint *)get_property(np, "reg", &ilen); + ireg = (u32 *)get_property(np, "reg", &ilen); if (!ireg) panic("xics_init_IRQ: can't find interrupt reg property"); - while (ilen) { - intnodes[indx].addr = (unsigned long)*ireg++ << 32; - ilen -= sizeof(uint); - intnodes[indx].addr |= *ireg++; - ilen -= sizeof(uint); - intnodes[indx].size = (unsigned long)*ireg++ << 32; - ilen -= sizeof(uint); - intnodes[indx].size |= *ireg++; - ilen -= sizeof(uint); - indx++; - if (indx >= NR_CPUS) break; + while (ilen >= (4 * sizeof(u32))) { + unsigned long addr, size; + + /* XXX Use proper OF parsing code here !!!
 */ + addr = (unsigned long)*ireg++ << 32; + ilen -= sizeof(u32); + addr |= *ireg++; + ilen -= sizeof(u32); + size = (unsigned long)*ireg++ << 32; + ilen -= sizeof(u32); + size |= *ireg++; + ilen -= sizeof(u32); + xics_map_one_cpu(*indx, addr, size); + (*indx)++; + } +} + + +static void __init xics_setup_8259_cascade(void) +{ + struct device_node *np, *old, *found = NULL; + int cascade, naddr; + u32 *addrp; + unsigned long intack = 0; + + for_each_node_by_type(np, "interrupt-controller") + if (device_is_compatible(np, "chrp,iic")) { + found = np; + break; + } + if (found == NULL) { + printk(KERN_DEBUG "xics: no ISA interrupt controller\n"); + return; } + cascade = irq_of_parse_and_map(found, 0); + if (cascade == NO_IRQ) { + printk(KERN_ERR "xics: failed to map cascade interrupt\n"); + return; + } + pr_debug("xics: cascade mapped to irq %d\n", cascade); + + for (old = of_node_get(found); old != NULL ; old = np) { + np = of_get_parent(old); + of_node_put(old); + if (np == NULL) + break; + if (strcmp(np->name, "pci") != 0) + continue; + addrp = (u32 *)get_property(np, "8259-interrupt-acknowledge", NULL); + if (addrp == NULL) + continue; + naddr = prom_n_addr_cells(np); + intack = addrp[naddr-1]; + if (naddr > 1) + intack |= ((unsigned long)addrp[naddr-2]) << 32; + } + if (intack) + printk(KERN_DEBUG "xics: PCI 8259 intack at 0x%016lx\n", intack); + i8259_init(found, intack); + of_node_put(found); + set_irq_chained_handler(cascade, pseries_8259_cascade); } - np = of_find_node_by_type(np, "PowerPC-External-Interrupt-Presentation"); - if ((indx < NR_CPUS) && np) goto nextnode; +void __init xics_init_IRQ(void) +{ + int i; + struct device_node *np; + u32 *ireg, ilen, indx = 0; + int found = 0; + + ppc64_boot_msg(0x20, "XICS Init"); + + ibm_get_xive = rtas_token("ibm,get-xive"); + ibm_set_xive = rtas_token("ibm,set-xive"); + ibm_int_on = rtas_token("ibm,int-on"); + ibm_int_off = rtas_token("ibm,int-off"); + + for_each_node_by_type(np, "PowerPC-External-Interrupt-Presentation") { + found = 1; + if (firmware_has_feature(FW_FEATURE_LPAR)) + break; + xics_init_one_node(np, &indx); + } + if (found == 0) + return; + + xics_init_host(); /* Find the server numbers for the boot cpu. */ for (np = of_find_node_by_type(NULL, "cpu"); np; np = of_find_node_by_type(np, "cpu")) { - ireg = (uint *)get_property(np, "reg", &ilen); + ireg = (u32 *)get_property(np, "reg", &ilen); if (ireg && ireg[0] == get_hard_smp_processor_id(boot_cpuid)) { - ireg = (uint *)get_property(np, "ibm,ppc-interrupt-gserver#s", - &ilen); + ireg = (u32 *)get_property(np, + "ibm,ppc-interrupt-gserver#s", + &ilen); i = ilen / sizeof(int); if (ireg && i > 0) { default_server = ireg[0]; - default_distrib_server = ireg[i-1]; /* take last element */ + /* take last element */ + default_distrib_server = ireg[i-1]; } - ireg = (uint *)get_property(np, + ireg = (u32 *)get_property(np, "ibm,interrupt-server#-size", NULL); if (ireg) interrupt_server_size = *ireg; @@ -581,102 +739,48 @@ nextnode: } of_node_put(np); - intr_base = intnodes[0].addr; - intr_size = intnodes[0].size; - - if (firmware_has_feature(FW_FEATURE_LPAR)) { - ppc_md.get_irq = xics_get_irq_lpar; - chip = &xics_pic_lpar; - } else { -#ifdef CONFIG_SMP - for_each_possible_cpu(i) { - int hard_id; - - /* FIXME: Do this dynamically!
--RR */ - if (!cpu_present(i)) - continue; - - hard_id = get_hard_smp_processor_id(i); - xics_per_cpu[i] = ioremap(intnodes[hard_id].addr, - intnodes[hard_id].size); - } -#else - xics_per_cpu[0] = ioremap(intr_base, intr_size); -#endif /* CONFIG_SMP */ + if (firmware_has_feature(FW_FEATURE_LPAR)) + ppc_md.get_irq = xics_get_irq_lpar; + else ppc_md.get_irq = xics_get_irq_direct; - chip = &xics_pic_direct; - - } - - for (i = irq_offset_value(); i < NR_IRQS; ++i) { - /* All IRQs on XICS are level for now. MSI code may want to modify - * that for reporting purposes - */ - get_irq_desc(i)->status |= IRQ_LEVEL; - set_irq_chip_and_handler(i, chip, handle_fasteoi_irq); - } xics_setup_cpu(); - ppc64_boot_msg(0x21, "XICS Done"); -} + xics_setup_8259_cascade(); -static int xics_setup_8259_cascade(void) -{ - struct device_node *np; - uint *ireg; - - np = of_find_node_by_type(NULL, "interrupt-controller"); - if (np == NULL) { - printk(KERN_WARNING "xics: no ISA interrupt controller\n"); - xics_irq_8259_cascade_real = -1; - xics_irq_8259_cascade = -1; - return 0; - } - - ireg = (uint *) get_property(np, "interrupts", NULL); - if (!ireg) - panic("xics_init_IRQ: can't find ISA interrupts property"); - - xics_irq_8259_cascade_real = *ireg; - xics_irq_8259_cascade = irq_offset_up - (virt_irq_create_mapping(xics_irq_8259_cascade_real)); - i8259_init(0, 0); - of_node_put(np); - - xics_set_irq_revmap(xics_irq_8259_cascade); - set_irq_chained_handler(xics_irq_8259_cascade, pSeries_8259_cascade); - - return 0; + ppc64_boot_msg(0x21, "XICS Done"); } -arch_initcall(xics_setup_8259_cascade); #ifdef CONFIG_SMP void xics_request_IPIs(void) { - virt_irq_to_real_map[XICS_IPI] = XICS_IPI; + unsigned int ipi; + + ipi = irq_create_mapping(xics_host, XICS_IPI, 0); + BUG_ON(ipi == NO_IRQ); /* * IPIs are marked IRQF_DISABLED as they must run with irqs * disabled */ - set_irq_handler(irq_offset_up(XICS_IPI), handle_percpu_irq); + set_irq_handler(ipi, handle_percpu_irq); if (firmware_has_feature(FW_FEATURE_LPAR)) - request_irq(irq_offset_up(XICS_IPI), xics_ipi_action_lpar, - SA_INTERRUPT, "IPI", NULL); + request_irq(ipi, xics_ipi_action_lpar, IRQF_DISABLED, + "IPI", NULL); else - request_irq(irq_offset_up(XICS_IPI), xics_ipi_action_direct, - SA_INTERRUPT, "IPI", NULL); + request_irq(ipi, xics_ipi_action_direct, IRQF_DISABLED, + "IPI", NULL); } #endif /* CONFIG_SMP */ void xics_teardown_cpu(int secondary) { - struct irq_desc *desc = get_irq_desc(irq_offset_up(XICS_IPI)); int cpu = smp_processor_id(); + unsigned int ipi; + struct irq_desc *desc; - xics_set_cpu_priority(cpu, 0); + xics_set_cpu_priority(cpu, 0); /* * we need to EOI the IPI if we got here from kexec down IPI @@ -685,6 +789,11 @@ void xics_teardown_cpu(int secondary) * should we be flagging idle loop instead? * or creating some task to be scheduled? 
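	 * (the EOI below matters because a kernel entered via the kexec
	 * down-IPI inherits the XICS state: until that IPI is EOI'd, its
	 * priority level stays blocked on this cpu)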
*/ + + ipi = irq_find_mapping(xics_host, XICS_IPI); + if (ipi == XICS_IRQ_SPURIOUS) + return; + desc = get_irq_desc(ipi); if (desc->chip && desc->chip->eoi) desc->chip->eoi(XICS_IPI); @@ -694,8 +803,8 @@ void xics_teardown_cpu(int secondary) */ if (secondary) rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE, - (1UL << interrupt_server_size) - 1 - - default_distrib_server, 0); + (1UL << interrupt_server_size) - 1 - + default_distrib_server, 0); } #ifdef CONFIG_HOTPLUG_CPU @@ -723,15 +832,15 @@ void xics_migrate_irqs_away(void) unsigned long flags; /* We cant set affinity on ISA interrupts */ - if (virq < irq_offset_value()) + if (virq < NUM_ISA_INTERRUPTS) continue; - - desc = get_irq_desc(virq); - irq = virt_irq_to_real(irq_offset_down(virq)); - + if (irq_map[virq].host != xics_host) + continue; + irq = (unsigned int)irq_map[virq].hwirq; /* We need to get IPIs still. */ - if (irq == XICS_IPI || irq == NO_IRQ) + if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) continue; + desc = get_irq_desc(virq); /* We only need to migrate enabled IRQS */ if (desc == NULL || desc->chip == NULL diff --git a/arch/powerpc/platforms/pseries/xics.h b/arch/powerpc/platforms/pseries/xics.h index 67dedf3514ec..6ee1055b0ffb 100644 --- a/arch/powerpc/platforms/pseries/xics.h +++ b/arch/powerpc/platforms/pseries/xics.h @@ -31,7 +31,7 @@ struct xics_ipi_struct { extern struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned; struct irq_desc; -extern void pSeries_8259_cascade(unsigned int irq, struct irq_desc *desc, +extern void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs); #endif /* _POWERPC_KERNEL_XICS_H */ diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index c2e9465871aa..72c73a6105cd 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -6,11 +6,16 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#undef DEBUG + #include #include #include +#include +#include #include #include +#include static volatile void __iomem *pci_intack; /* RO, gives us the irq vector */ @@ -20,7 +25,8 @@ static unsigned char cached_8259[2] = { 0xff, 0xff }; static DEFINE_SPINLOCK(i8259_lock); -static int i8259_pic_irq_offset; +static struct device_node *i8259_node; +static struct irq_host *i8259_host; /* * Acknowledge the IRQ using either the PCI host bridge's interrupt @@ -28,16 +34,18 @@ static int i8259_pic_irq_offset; * which is called. It should be noted that polling is broken on some * IBM and Motorola PReP boxes so we must use the int-ack feature on them. */ -int i8259_irq(struct pt_regs *regs) +unsigned int i8259_irq(struct pt_regs *regs) { int irq; - - spin_lock(&i8259_lock); + int lock = 0; /* Either int-ack or poll for the IRQ */ if (pci_intack) irq = readb(pci_intack); else { + spin_lock(&i8259_lock); + lock = 1; + /* Perform an interrupt acknowledge cycle on controller 1. 
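+		 * Writing OCW3 with the poll bit set (0x0C) makes the next
+		 * read of the command port return the poll byte: bit 7 is set
+		 * when an interrupt is pending and bits 0-2 carry the highest
+		 * priority pending level, hence the mask with 7 below.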
*/ outb(0x0C, 0x20); /* prepare for poll */ irq = inb(0x20) & 7; @@ -62,11 +70,13 @@ int i8259_irq(struct pt_regs *regs) if (!pci_intack) outb(0x0B, 0x20); /* ISR register */ if(~inb(0x20) & 0x80) - irq = -1; - } + irq = NO_IRQ; + } else if (irq == 0xff) + irq = NO_IRQ; - spin_unlock(&i8259_lock); - return irq + i8259_pic_irq_offset; + if (lock) + spin_unlock(&i8259_lock); + return irq; } static void i8259_mask_and_ack_irq(unsigned int irq_nr) @@ -74,7 +84,6 @@ static void i8259_mask_and_ack_irq(unsigned int irq_nr) unsigned long flags; spin_lock_irqsave(&i8259_lock, flags); - irq_nr -= i8259_pic_irq_offset; if (irq_nr > 7) { cached_A1 |= 1 << (irq_nr-8); inb(0xA1); /* DUMMY */ @@ -100,8 +109,9 @@ static void i8259_mask_irq(unsigned int irq_nr) { unsigned long flags; + pr_debug("i8259_mask_irq(%d)\n", irq_nr); + spin_lock_irqsave(&i8259_lock, flags); - irq_nr -= i8259_pic_irq_offset; if (irq_nr < 8) cached_21 |= 1 << irq_nr; else @@ -114,8 +124,9 @@ static void i8259_unmask_irq(unsigned int irq_nr) { unsigned long flags; + pr_debug("i8259_unmask_irq(%d)\n", irq_nr); + spin_lock_irqsave(&i8259_lock, flags); - irq_nr -= i8259_pic_irq_offset; if (irq_nr < 8) cached_21 &= ~(1 << irq_nr); else @@ -152,25 +163,84 @@ static struct resource pic_edgectrl_iores = { .flags = IORESOURCE_BUSY, }; -static struct irqaction i8259_irqaction = { - .handler = no_action, - .flags = IRQF_DISABLED, - .mask = CPU_MASK_NONE, - .name = "82c59 secondary cascade", +static int i8259_host_match(struct irq_host *h, struct device_node *node) +{ + return i8259_node == NULL || i8259_node == node; +} + +static int i8259_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + pr_debug("i8259_host_map(%d, 0x%lx)\n", virq, hw); + + /* We block the internal cascade */ + if (hw == 2) + get_irq_desc(virq)->status |= IRQ_NOREQUEST; + + /* We use the level stuff only for now, we might want to + * be more cautious here but that works for now + */ + get_irq_desc(virq)->status |= IRQ_LEVEL; + set_irq_chip_and_handler(virq, &i8259_pic, handle_level_irq); + return 0; +} + +static void i8259_host_unmap(struct irq_host *h, unsigned int virq) +{ + /* Make sure irq is masked in hardware */ + i8259_mask_irq(virq); + + /* remove chip and handler */ + set_irq_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); +} + +static int i8259_host_xlate(struct irq_host *h, struct device_node *ct, + u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) +{ + static unsigned char map_isa_senses[4] = { + IRQ_TYPE_LEVEL_LOW, + IRQ_TYPE_LEVEL_HIGH, + IRQ_TYPE_EDGE_FALLING, + IRQ_TYPE_EDGE_RISING, + }; + + *out_hwirq = intspec[0]; + if (intsize > 1 && intspec[1] < 4) + *out_flags = map_isa_senses[intspec[1]]; + else + *out_flags = IRQ_TYPE_NONE; + + return 0; +} + +static struct irq_host_ops i8259_host_ops = { + .match = i8259_host_match, + .map = i8259_host_map, + .unmap = i8259_host_unmap, + .xlate = i8259_host_xlate, }; -/* - * i8259_init() - * intack_addr - PCI interrupt acknowledge (real) address which will return - * the active irq from the 8259 +/**** + * i8259_init - Initialize the legacy controller + * @node: device node of the legacy PIC (can be NULL, but then, it will match + * all interrupts, so beware) + * @intack_addr: PCI interrupt acknowledge (real) address which will return + * the active irq from the 8259 */ -void __init i8259_init(unsigned long intack_addr, int offset) +void i8259_init(struct device_node *node, unsigned 
long intack_addr) { unsigned long flags; - int i; + /* initialize the controller */ spin_lock_irqsave(&i8259_lock, flags); - i8259_pic_irq_offset = offset; + + /* Mask all first */ + outb(0xff, 0xA1); + outb(0xff, 0x21); /* init master interrupt controller */ outb(0x11, 0x20); /* Start init sequence */ @@ -184,24 +254,36 @@ void __init i8259_init(unsigned long intack_addr, int offset) outb(0x02, 0xA1); /* edge triggered, Cascade (slave) on IRQ2 */ outb(0x01, 0xA1); /* Select 8086 mode */ + /* That thing is slow */ + udelay(100); + /* always read ISR */ outb(0x0B, 0x20); outb(0x0B, 0xA0); - /* Mask all interrupts */ + /* Unmask the internal cascade */ + cached_21 &= ~(1 << 2); + + /* Set interrupt masks */ outb(cached_A1, 0xA1); outb(cached_21, 0x21); spin_unlock_irqrestore(&i8259_lock, flags); - for (i = 0; i < NUM_ISA_INTERRUPTS; ++i) { - set_irq_chip_and_handler(offset + i, &i8259_pic, - handle_level_irq); - irq_desc[offset + i].status |= IRQ_LEVEL; + /* create a legacy host */ + if (node) + i8259_node = of_node_get(node); + i8259_host = irq_alloc_host(IRQ_HOST_MAP_LEGACY, 0, &i8259_host_ops, 0); + if (i8259_host == NULL) { + printk(KERN_ERR "i8259: failed to allocate irq host !\n"); + return; } /* reserve our resources */ - setup_irq(offset + 2, &i8259_irqaction); + /* XXX should we continue doing that ? it seems to cause problems + * with further requesting of PCI IO resources for that range... + * need to look into it. + */ request_resource(&ioport_resource, &pic1_iores); request_resource(&ioport_resource, &pic2_iores); request_resource(&ioport_resource, &pic_edgectrl_iores); @@ -209,4 +291,5 @@ void __init i8259_init(unsigned long intack_addr, int offset) if (intack_addr != 0) pci_intack = ioremap(intack_addr, 1); + printk(KERN_INFO "i8259 legacy interrupt controller initialized\n"); } diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 9a95f16c19a5..7d31d7cc392d 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -340,27 +340,19 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic) #endif /* CONFIG_MPIC_BROKEN_U3 */ +#define mpic_irq_to_hw(virq) ((unsigned int)irq_map[virq].hwirq) + /* Find an mpic associated with a given linux interrupt */ static struct mpic *mpic_find(unsigned int irq, unsigned int *is_ipi) { - struct mpic *mpic = mpics; - - while(mpic) { - /* search IPIs first since they may override the main interrupts */ - if (irq >= mpic->ipi_offset && irq < (mpic->ipi_offset + 4)) { - if (is_ipi) - *is_ipi = 1; - return mpic; - } - if (irq >= mpic->irq_offset && - irq < (mpic->irq_offset + mpic->irq_count)) { - if (is_ipi) - *is_ipi = 0; - return mpic; - } - mpic = mpic -> next; - } - return NULL; + unsigned int src = mpic_irq_to_hw(irq); + + if (irq < NUM_ISA_INTERRUPTS) + return NULL; + if (is_ipi) + *is_ipi = (src >= MPIC_VEC_IPI_0 && src <= MPIC_VEC_IPI_3); + + return irq_desc[irq].chip_data; } /* Convert a cpu mask from logical to physical cpu numbers. 
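 * Conceptually, bit i of the logical mask moves to bit
 * get_hard_smp_processor_id(i) of the physical mask; a minimal sketch of
 * that transformation (the in-tree helper may differ in detail):
 *
 *	u32 mask = 0;
 *	int i;
 *
 *	for (i = 0; i < 32; i++, cpumask >>= 1)
 *		mask |= (cpumask & 1) << get_hard_smp_processor_id(i);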
*/ @@ -398,9 +390,7 @@ static inline void mpic_eoi(struct mpic *mpic) #ifdef CONFIG_SMP static irqreturn_t mpic_ipi_action(int irq, void *dev_id, struct pt_regs *regs) { - struct mpic *mpic = dev_id; - - smp_message_recv(irq - mpic->ipi_offset, regs); + smp_message_recv(mpic_irq_to_hw(irq) - MPIC_VEC_IPI_0, regs); return IRQ_HANDLED; } #endif /* CONFIG_SMP */ @@ -414,7 +404,7 @@ static void mpic_unmask_irq(unsigned int irq) { unsigned int loops = 100000; struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; + unsigned int src = mpic_irq_to_hw(irq); DBG("%p: %s: enable_irq: %d (src %d)\n", mpic, mpic->name, irq, src); @@ -435,7 +425,7 @@ static void mpic_mask_irq(unsigned int irq) { unsigned int loops = 100000; struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; + unsigned int src = mpic_irq_to_hw(irq); DBG("%s: disable_irq: %d (src %d)\n", mpic->name, irq, src); @@ -472,7 +462,7 @@ static void mpic_end_irq(unsigned int irq) static void mpic_unmask_ht_irq(unsigned int irq) { struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; + unsigned int src = mpic_irq_to_hw(irq); mpic_unmask_irq(irq); @@ -483,7 +473,7 @@ static void mpic_unmask_ht_irq(unsigned int irq) static unsigned int mpic_startup_ht_irq(unsigned int irq) { struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; + unsigned int src = mpic_irq_to_hw(irq); mpic_unmask_irq(irq); mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status); @@ -494,7 +484,7 @@ static unsigned int mpic_startup_ht_irq(unsigned int irq) static void mpic_shutdown_ht_irq(unsigned int irq) { struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; + unsigned int src = mpic_irq_to_hw(irq); mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status); mpic_mask_irq(irq); @@ -503,7 +493,7 @@ static void mpic_shutdown_ht_irq(unsigned int irq) static void mpic_end_ht_irq(unsigned int irq) { struct mpic *mpic = mpic_from_irq(irq); - unsigned int src = irq - mpic->irq_offset; + unsigned int src = mpic_irq_to_hw(irq); #ifdef DEBUG_IRQ DBG("%s: end_irq: %d\n", mpic->name, irq); @@ -525,7 +515,7 @@ static void mpic_end_ht_irq(unsigned int irq) static void mpic_unmask_ipi(unsigned int irq) { struct mpic *mpic = mpic_from_ipi(irq); - unsigned int src = irq - mpic->ipi_offset; + unsigned int src = mpic_irq_to_hw(irq) - MPIC_VEC_IPI_0; DBG("%s: enable_ipi: %d (ipi %d)\n", mpic->name, irq, src); mpic_ipi_write(src, mpic_ipi_read(src) & ~MPIC_VECPRI_MASK); @@ -555,15 +545,46 @@ static void mpic_end_ipi(unsigned int irq) static void mpic_set_affinity(unsigned int irq, cpumask_t cpumask) { struct mpic *mpic = mpic_from_irq(irq); + unsigned int src = mpic_irq_to_hw(irq); cpumask_t tmp; cpus_and(tmp, cpumask, cpu_online_map); - mpic_irq_write(irq - mpic->irq_offset, MPIC_IRQ_DESTINATION, + mpic_irq_write(src, MPIC_IRQ_DESTINATION, mpic_physmask(cpus_addr(tmp)[0])); } +static unsigned int mpic_flags_to_vecpri(unsigned int flags, int *level) +{ + unsigned int vecpri; + + /* Now convert sense value */ + switch(flags & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + vecpri = MPIC_VECPRI_SENSE_EDGE | + MPIC_VECPRI_POLARITY_POSITIVE; + *level = 0; + break; + case IRQ_TYPE_EDGE_FALLING: + vecpri = MPIC_VECPRI_SENSE_EDGE | + MPIC_VECPRI_POLARITY_NEGATIVE; + *level = 0; + break; + case IRQ_TYPE_LEVEL_HIGH: + vecpri = MPIC_VECPRI_SENSE_LEVEL | + MPIC_VECPRI_POLARITY_POSITIVE; + *level = 1; + break; + case IRQ_TYPE_LEVEL_LOW: + default: 
+ vecpri = MPIC_VECPRI_SENSE_LEVEL | + MPIC_VECPRI_POLARITY_NEGATIVE; + *level = 1; + } + return vecpri; +} + static struct irq_chip mpic_irq_chip = { .mask = mpic_mask_irq, .unmask = mpic_unmask_irq, @@ -589,19 +610,111 @@ static struct irq_chip mpic_irq_ht_chip = { #endif /* CONFIG_MPIC_BROKEN_U3 */ +static int mpic_host_match(struct irq_host *h, struct device_node *node) +{ + struct mpic *mpic = h->host_data; + + /* Exact match, unless mpic node is NULL */ + return mpic->of_node == NULL || mpic->of_node == node; +} + +static int mpic_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw, unsigned int flags) +{ + struct irq_desc *desc = get_irq_desc(virq); + struct irq_chip *chip; + struct mpic *mpic = h->host_data; + unsigned int vecpri = MPIC_VECPRI_SENSE_LEVEL | + MPIC_VECPRI_POLARITY_NEGATIVE; + int level; + + pr_debug("mpic: map virq %d, hwirq 0x%lx, flags: 0x%x\n", + virq, hw, flags); + + if (hw == MPIC_VEC_SPURRIOUS) + return -EINVAL; +#ifdef CONFIG_SMP + else if (hw >= MPIC_VEC_IPI_0) { + WARN_ON(!(mpic->flags & MPIC_PRIMARY)); + + pr_debug("mpic: mapping as IPI\n"); + set_irq_chip_data(virq, mpic); + set_irq_chip_and_handler(virq, &mpic->hc_ipi, + handle_percpu_irq); + return 0; + } +#endif /* CONFIG_SMP */ + + if (hw >= mpic->irq_count) + return -EINVAL; + + /* If no sense provided, check default sense array */ + if (((flags & IRQ_TYPE_SENSE_MASK) == IRQ_TYPE_NONE) && + mpic->senses && hw < mpic->senses_count) + flags |= mpic->senses[hw]; + + vecpri = mpic_flags_to_vecpri(flags, &level); + if (level) + desc->status |= IRQ_LEVEL; + chip = &mpic->hc_irq; + +#ifdef CONFIG_MPIC_BROKEN_U3 + /* Check for HT interrupts, override vecpri */ + if (mpic_is_ht_interrupt(mpic, hw)) { + vecpri &= ~(MPIC_VECPRI_SENSE_MASK | + MPIC_VECPRI_POLARITY_MASK); + vecpri |= MPIC_VECPRI_POLARITY_POSITIVE; + chip = &mpic->hc_ht_irq; + } +#endif + + /* Reconfigure irq */ + vecpri |= MPIC_VECPRI_MASK | hw | (8 << MPIC_VECPRI_PRIORITY_SHIFT); + mpic_irq_write(hw, MPIC_IRQ_VECTOR_PRI, vecpri); + + pr_debug("mpic: mapping as IRQ\n"); + + set_irq_chip_data(virq, mpic); + set_irq_chip_and_handler(virq, chip, handle_fasteoi_irq); + return 0; +} + +static int mpic_host_xlate(struct irq_host *h, struct device_node *ct, + u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + static unsigned char map_mpic_senses[4] = { + IRQ_TYPE_EDGE_RISING, + IRQ_TYPE_LEVEL_LOW, + IRQ_TYPE_LEVEL_HIGH, + IRQ_TYPE_EDGE_FALLING, + }; + + *out_hwirq = intspec[0]; + if (intsize > 1 && intspec[1] < 4) + *out_flags = map_mpic_senses[intspec[1]]; + else + *out_flags = IRQ_TYPE_NONE; + + return 0; +} + +static struct irq_host_ops mpic_host_ops = { + .match = mpic_host_match, + .map = mpic_host_map, + .xlate = mpic_host_xlate, +}; + /* * Exported functions */ - -struct mpic * __init mpic_alloc(unsigned long phys_addr, +struct mpic * __init mpic_alloc(struct device_node *node, + unsigned long phys_addr, unsigned int flags, unsigned int isu_size, - unsigned int irq_offset, unsigned int irq_count, - unsigned int ipi_offset, - unsigned char *senses, - unsigned int senses_count, const char *name) { struct mpic *mpic; @@ -613,10 +726,19 @@ struct mpic * __init mpic_alloc(unsigned long phys_addr, if (mpic == NULL) return NULL; - memset(mpic, 0, sizeof(struct mpic)); mpic->name = name; + mpic->of_node = node ? 
of_node_get(node) : NULL; + mpic->irqhost = irq_alloc_host(IRQ_HOST_MAP_LINEAR, 256, + &mpic_host_ops, + MPIC_VEC_SPURRIOUS); + if (mpic->irqhost == NULL) { + of_node_put(node); + return NULL; + } + + mpic->irqhost->host_data = mpic; mpic->hc_irq = mpic_irq_chip; mpic->hc_irq.typename = name; if (flags & MPIC_PRIMARY) @@ -628,18 +750,14 @@ struct mpic * __init mpic_alloc(unsigned long phys_addr, mpic->hc_ht_irq.set_affinity = mpic_set_affinity; #endif /* CONFIG_MPIC_BROKEN_U3 */ #ifdef CONFIG_SMP - mpic->hc_ipi.typename = name; mpic->hc_ipi = mpic_ipi_chip; + mpic->hc_ipi.typename = name; #endif /* CONFIG_SMP */ mpic->flags = flags; mpic->isu_size = isu_size; - mpic->irq_offset = irq_offset; mpic->irq_count = irq_count; - mpic->ipi_offset = ipi_offset; mpic->num_sources = 0; /* so far */ - mpic->senses = senses; - mpic->senses_count = senses_count; /* Map the global registers */ mpic->gregs = ioremap(phys_addr + MPIC_GREG_BASE, 0x1000); @@ -707,8 +825,10 @@ struct mpic * __init mpic_alloc(unsigned long phys_addr, mpic->next = mpics; mpics = mpic; - if (flags & MPIC_PRIMARY) + if (flags & MPIC_PRIMARY) { mpic_primary = mpic; + irq_set_default_host(mpic->irqhost); + } return mpic; } @@ -725,11 +845,22 @@ void __init mpic_assign_isu(struct mpic *mpic, unsigned int isu_num, mpic->num_sources = isu_first + mpic->isu_size; } +void __init mpic_set_default_senses(struct mpic *mpic, u8 *senses, int count) +{ + mpic->senses = senses; + mpic->senses_count = count; +} + void __init mpic_init(struct mpic *mpic) { int i; BUG_ON(mpic->num_sources == 0); + WARN_ON(mpic->num_sources > MPIC_VEC_IPI_0); + + /* Sanitize source count */ + if (mpic->num_sources > MPIC_VEC_IPI_0) + mpic->num_sources = MPIC_VEC_IPI_0; printk(KERN_INFO "mpic: Initializing for %d sources\n", mpic->num_sources); @@ -753,14 +884,6 @@ void __init mpic_init(struct mpic *mpic) MPIC_VECPRI_MASK | (10 << MPIC_VECPRI_PRIORITY_SHIFT) | (MPIC_VEC_IPI_0 + i)); -#ifdef CONFIG_SMP - if (!(mpic->flags & MPIC_PRIMARY)) - continue; - set_irq_chip_data(mpic->ipi_offset+i, mpic); - set_irq_chip_and_handler(mpic->ipi_offset+i, - &mpic->hc_ipi, - handle_percpu_irq); -#endif /* CONFIG_SMP */ } /* Initialize interrupt sources */ @@ -777,25 +900,15 @@ void __init mpic_init(struct mpic *mpic) for (i = 0; i < mpic->num_sources; i++) { /* start with vector = source number, and masked */ u32 vecpri = MPIC_VECPRI_MASK | i | (8 << MPIC_VECPRI_PRIORITY_SHIFT); - int level = 0; + int level = 1; - /* if it's an IPI, we skip it */ - if ((mpic->irq_offset + i) >= (mpic->ipi_offset + i) && - (mpic->irq_offset + i) < (mpic->ipi_offset + i + 4)) - continue; - /* do senses munging */ - if (mpic->senses && i < mpic->senses_count) { - if (mpic->senses[i] & IRQ_SENSE_LEVEL) - vecpri |= MPIC_VECPRI_SENSE_LEVEL; - if (mpic->senses[i] & IRQ_POLARITY_POSITIVE) - vecpri |= MPIC_VECPRI_POLARITY_POSITIVE; - } else + if (mpic->senses && i < mpic->senses_count) + vecpri = mpic_flags_to_vecpri(mpic->senses[i], + &level); + else vecpri |= MPIC_VECPRI_SENSE_LEVEL; - /* remember if it was a level interrupts */ - level = (vecpri & MPIC_VECPRI_SENSE_LEVEL); - /* deal with broken U3 */ if (mpic->flags & MPIC_BROKEN_U3) { #ifdef CONFIG_MPIC_BROKEN_U3 @@ -816,21 +929,6 @@ void __init mpic_init(struct mpic *mpic) mpic_irq_write(i, MPIC_IRQ_VECTOR_PRI, vecpri); mpic_irq_write(i, MPIC_IRQ_DESTINATION, 1 << hard_smp_processor_id()); - - /* init linux descriptors */ - if (i < mpic->irq_count) { - struct irq_chip *chip = &mpic->hc_irq; - - irq_desc[mpic->irq_offset+i].status |= - level ? 
					IRQ_LEVEL : 0;
-#ifdef CONFIG_MPIC_BROKEN_U3
-			if (mpic_is_ht_interrupt(mpic, i))
-				chip = &mpic->hc_ht_irq;
-#endif /* CONFIG_MPIC_BROKEN_U3 */
-			set_irq_chip_data(mpic->irq_offset+i, mpic);
-			set_irq_chip_and_handler(mpic->irq_offset+i, chip,
-						 handle_fasteoi_irq);
-		}
 	}

 	/* Init spurious vector */
@@ -871,19 +969,20 @@ void mpic_irq_set_priority(unsigned int irq, unsigned int pri)
 {
 	int is_ipi;
 	struct mpic *mpic = mpic_find(irq, &is_ipi);
+	unsigned int src = mpic_irq_to_hw(irq);
 	unsigned long flags;
 	u32 reg;

 	spin_lock_irqsave(&mpic_lock, flags);
 	if (is_ipi) {
-		reg = mpic_ipi_read(irq - mpic->ipi_offset) &
+		reg = mpic_ipi_read(src - MPIC_VEC_IPI_0) &
 			~MPIC_VECPRI_PRIORITY_MASK;
-		mpic_ipi_write(irq - mpic->ipi_offset,
+		mpic_ipi_write(src - MPIC_VEC_IPI_0,
 			       reg | (pri << MPIC_VECPRI_PRIORITY_SHIFT));
 	} else {
-		reg = mpic_irq_read(irq - mpic->irq_offset,MPIC_IRQ_VECTOR_PRI)
+		reg = mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI)
 			& ~MPIC_VECPRI_PRIORITY_MASK;
-		mpic_irq_write(irq - mpic->irq_offset, MPIC_IRQ_VECTOR_PRI,
+		mpic_irq_write(src, MPIC_IRQ_VECTOR_PRI,
 			       reg | (pri << MPIC_VECPRI_PRIORITY_SHIFT));
 	}
 	spin_unlock_irqrestore(&mpic_lock, flags);
@@ -893,14 +992,15 @@ unsigned int mpic_irq_get_priority(unsigned int irq)
 {
 	int is_ipi;
 	struct mpic *mpic = mpic_find(irq, &is_ipi);
+	unsigned int src = mpic_irq_to_hw(irq);
 	unsigned long flags;
 	u32 reg;

 	spin_lock_irqsave(&mpic_lock, flags);
 	if (is_ipi)
-		reg = mpic_ipi_read(irq - mpic->ipi_offset);
+		reg = mpic_ipi_read(src - MPIC_VEC_IPI_0);
 	else
-		reg = mpic_irq_read(irq - mpic->irq_offset, MPIC_IRQ_VECTOR_PRI);
+		reg = mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI);
 	spin_unlock_irqrestore(&mpic_lock, flags);
 	return (reg & MPIC_VECPRI_PRIORITY_MASK) >> MPIC_VECPRI_PRIORITY_SHIFT;
 }
@@ -995,29 +1095,20 @@ void mpic_send_ipi(unsigned int ipi_no, unsigned int cpu_mask)
 		       mpic_physmask(cpu_mask & cpus_addr(cpu_online_map)[0]));
 }

-int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs)
+unsigned int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs)
 {
-	u32 irq;
+	u32 src;

-	irq = mpic_cpu_read(MPIC_CPU_INTACK) & MPIC_VECPRI_VECTOR_MASK;
+	src = mpic_cpu_read(MPIC_CPU_INTACK) & MPIC_VECPRI_VECTOR_MASK;
 #ifdef DEBUG_LOW
-	DBG("%s: get_one_irq(): %d\n", mpic->name, irq);
-#endif
-	if (unlikely(irq == MPIC_VEC_SPURRIOUS))
-		return -1;
-	if (irq < MPIC_VEC_IPI_0) {
-#ifdef DEBUG_IRQ
-		DBG("%s: irq %d\n", mpic->name, irq + mpic->irq_offset);
+	DBG("%s: get_one_irq(): %d\n", mpic->name, src);
 #endif
-		return irq + mpic->irq_offset;
-	}
-#ifdef DEBUG_IPI
-	DBG("%s: ipi %d !\n", mpic->name, irq - MPIC_VEC_IPI_0);
-#endif
-	return irq - MPIC_VEC_IPI_0 + mpic->ipi_offset;
+	if (unlikely(src == MPIC_VEC_SPURRIOUS))
+		return NO_IRQ;
+	return irq_linear_revmap(mpic->irqhost, src);
 }

-int mpic_get_irq(struct pt_regs *regs)
+unsigned int mpic_get_irq(struct pt_regs *regs)
 {
 	struct mpic *mpic = mpic_primary;

@@ -1031,25 +1122,27 @@ void mpic_request_ipis(void)
 {
 	struct mpic *mpic = mpic_primary;
-
+	int i;
+	static char *ipi_names[] = {
+		"IPI0 (call function)",
+		"IPI1 (reschedule)",
+		"IPI2 (unused)",
+		"IPI3 (debugger break)",
+	};
 	BUG_ON(mpic == NULL);
-
-	printk("requesting IPIs ... 
\n"); - /* - * IPIs are marked IRQF_DISABLED as they must run with irqs - * disabled - */ - request_irq(mpic->ipi_offset+0, mpic_ipi_action, IRQF_DISABLED, - "IPI0 (call function)", mpic); - request_irq(mpic->ipi_offset+1, mpic_ipi_action, IRQF_DISABLED, - "IPI1 (reschedule)", mpic); - request_irq(mpic->ipi_offset+2, mpic_ipi_action, IRQF_DISABLED, - "IPI2 (unused)", mpic); - request_irq(mpic->ipi_offset+3, mpic_ipi_action, IRQF_DISABLED, - "IPI3 (debugger break)", mpic); - - printk("IPIs requested... \n"); + printk(KERN_INFO "mpic: requesting IPIs ... \n"); + + for (i = 0; i < 4; i++) { + unsigned int vipi = irq_create_mapping(mpic->irqhost, + MPIC_VEC_IPI_0 + i, 0); + if (vipi == NO_IRQ) { + printk(KERN_ERR "Failed to map IPI %d\n", i); + break; + } + request_irq(vipi, mpic_ipi_action, IRQF_DISABLED, + ipi_names[i], mpic); + } } void smp_mpic_message_pass(int target, int msg) diff --git a/drivers/char/hvsi.c b/drivers/char/hvsi.c index 8dc205b275e3..56612a2dca6b 100644 --- a/drivers/char/hvsi.c +++ b/drivers/char/hvsi.c @@ -1299,13 +1299,12 @@ static int __init hvsi_console_init(void) hp->inbuf_end = hp->inbuf; hp->state = HVSI_CLOSED; hp->vtermno = *vtermno; - hp->virq = virt_irq_create_mapping(irq[0]); + hp->virq = irq_create_mapping(NULL, irq[0], 0); if (hp->virq == NO_IRQ) { printk(KERN_ERR "%s: couldn't create irq mapping for 0x%x\n", - __FUNCTION__, hp->virq); + __FUNCTION__, irq[0]); continue; - } else - hp->virq = irq_offset_up(hp->virq); + } hvsi_count++; } diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index 314fc0830d90..4b08852c35ee 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -90,22 +90,12 @@ int macio_init(void) { struct device_node *adbs; struct resource r; + unsigned int irq; adbs = find_compatible_devices("adb", "chrp,adb0"); if (adbs == 0) return -ENXIO; -#if 0 - { int i = 0; - - printk("macio_adb_init: node = %p, addrs =", adbs->node); - while(!of_address_to_resource(adbs, i, &r)) - printk(" %x(%x)", r.start, r.end - r.start); - printk(", intrs ="); - for (i = 0; i < adbs->n_intrs; ++i) - printk(" %x", adbs->intrs[i].line); - printk("\n"); } -#endif if (of_address_to_resource(adbs, 0, &r)) return -ENXIO; adb = ioremap(r.start, sizeof(struct adb_regs)); @@ -117,10 +107,9 @@ int macio_init(void) out_8(&adb->active_lo.r, 0xff); out_8(&adb->autopoll.r, APE); - if (request_irq(adbs->intrs[0].line, macio_adb_interrupt, - 0, "ADB", (void *)0)) { - printk(KERN_ERR "ADB: can't get irq %d\n", - adbs->intrs[0].line); + irq = irq_of_parse_and_map(adbs, 0); + if (request_irq(irq, macio_adb_interrupt, 0, "ADB", (void *)0)) { + printk(KERN_ERR "ADB: can't get irq %d\n", irq); return -EAGAIN; } out_8(&adb->intr_enb.r, DFB | TAG); diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 40ae7b6a939d..80c0c665b5f6 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -280,75 +280,128 @@ static void macio_release_dev(struct device *dev) static int macio_resource_quirks(struct device_node *np, struct resource *res, int index) { - if (res->flags & IORESOURCE_MEM) { - /* Grand Central has too large resource 0 on some machines */ - if (index == 0 && !strcmp(np->name, "gc")) - res->end = res->start + 0x1ffff; + /* Only quirks for memory resources for now */ + if ((res->flags & IORESOURCE_MEM) == 0) + return 0; + + /* Grand Central has too large resource 0 on some machines */ + if (index == 0 && !strcmp(np->name, "gc")) + res->end = res->start + 0x1ffff; - /* Airport has 
bogus resource 2 */ - if (index >= 2 && !strcmp(np->name, "radio")) - return 1; + /* Airport has bogus resource 2 */ + if (index >= 2 && !strcmp(np->name, "radio")) + return 1; #ifndef CONFIG_PPC64 - /* DBDMAs may have bogus sizes */ - if ((res->start & 0x0001f000) == 0x00008000) - res->end = res->start + 0xff; + /* DBDMAs may have bogus sizes */ + if ((res->start & 0x0001f000) == 0x00008000) + res->end = res->start + 0xff; #endif /* CONFIG_PPC64 */ - /* ESCC parent eats child resources. We could have added a - * level of hierarchy, but I don't really feel the need - * for it - */ - if (!strcmp(np->name, "escc")) - return 1; - - /* ESCC has bogus resources >= 3 */ - if (index >= 3 && !(strcmp(np->name, "ch-a") && - strcmp(np->name, "ch-b"))) - return 1; - - /* Media bay has too many resources, keep only first one */ - if (index > 0 && !strcmp(np->name, "media-bay")) - return 1; - - /* Some older IDE resources have bogus sizes */ - if (!(strcmp(np->name, "IDE") && strcmp(np->name, "ATA") && - strcmp(np->type, "ide") && strcmp(np->type, "ata"))) { - if (index == 0 && (res->end - res->start) > 0xfff) - res->end = res->start + 0xfff; - if (index == 1 && (res->end - res->start) > 0xff) - res->end = res->start + 0xff; - } + /* ESCC parent eats child resources. We could have added a + * level of hierarchy, but I don't really feel the need + * for it + */ + if (!strcmp(np->name, "escc")) + return 1; + + /* ESCC has bogus resources >= 3 */ + if (index >= 3 && !(strcmp(np->name, "ch-a") && + strcmp(np->name, "ch-b"))) + return 1; + + /* Media bay has too many resources, keep only first one */ + if (index > 0 && !strcmp(np->name, "media-bay")) + return 1; + + /* Some older IDE resources have bogus sizes */ + if (!(strcmp(np->name, "IDE") && strcmp(np->name, "ATA") && + strcmp(np->type, "ide") && strcmp(np->type, "ata"))) { + if (index == 0 && (res->end - res->start) > 0xfff) + res->end = res->start + 0xfff; + if (index == 1 && (res->end - res->start) > 0xff) + res->end = res->start + 0xff; } return 0; } +static void macio_create_fixup_irq(struct macio_dev *dev, int index, + unsigned int line) +{ + unsigned int irq; -static void macio_setup_interrupts(struct macio_dev *dev) + irq = irq_create_mapping(NULL, line, 0); + if (irq != NO_IRQ) { + dev->interrupt[index].start = irq; + dev->interrupt[index].flags = IORESOURCE_IRQ; + dev->interrupt[index].name = dev->ofdev.dev.bus_id; + } + if (dev->n_interrupts <= index) + dev->n_interrupts = index + 1; +} + +static void macio_add_missing_resources(struct macio_dev *dev) { struct device_node *np = dev->ofdev.node; - int i,j; + unsigned int irq_base; + + /* Gatwick has some missing interrupts on child nodes */ + if (dev->bus->chip->type != macio_gatwick) + return; - /* For now, we use pre-parsed entries in the device-tree for - * interrupt routing and addresses, but we should change that - * to dynamically parsed entries and so get rid of most of the - * clutter in struct device_node + /* irq_base is always 64 on gatwick. 
I have no cleaner way to get + * that value from here at this point */ - for (i = j = 0; i < np->n_intrs; i++) { + irq_base = 64; + + /* Fix SCC */ + if (strcmp(np->name, "ch-a") == 0) { + macio_create_fixup_irq(dev, 0, 15 + irq_base); + macio_create_fixup_irq(dev, 1, 4 + irq_base); + macio_create_fixup_irq(dev, 2, 5 + irq_base); + printk(KERN_INFO "macio: fixed SCC irqs on gatwick\n"); + } + + /* Fix media-bay */ + if (strcmp(np->name, "media-bay") == 0) { + macio_create_fixup_irq(dev, 0, 29 + irq_base); + printk(KERN_INFO "macio: fixed media-bay irq on gatwick\n"); + } + + /* Fix left media bay childs */ + if (dev->media_bay != NULL && strcmp(np->name, "floppy") == 0) { + macio_create_fixup_irq(dev, 0, 19 + irq_base); + macio_create_fixup_irq(dev, 1, 1 + irq_base); + printk(KERN_INFO "macio: fixed left floppy irqs\n"); + } + if (dev->media_bay != NULL && strcasecmp(np->name, "ata4") == 0) { + macio_create_fixup_irq(dev, 0, 14 + irq_base); + macio_create_fixup_irq(dev, 0, 3 + irq_base); + printk(KERN_INFO "macio: fixed left ide irqs\n"); + } +} + +static void macio_setup_interrupts(struct macio_dev *dev) +{ + struct device_node *np = dev->ofdev.node; + unsigned int irq; + int i = 0, j = 0; + + for (;;) { struct resource *res = &dev->interrupt[j]; if (j >= MACIO_DEV_COUNT_IRQS) break; - res->start = np->intrs[i].line; - res->flags = IORESOURCE_IO; - if (np->intrs[j].sense) - res->flags |= IORESOURCE_IRQ_LOWLEVEL; - else - res->flags |= IORESOURCE_IRQ_HIGHEDGE; + irq = irq_of_parse_and_map(np, i++); + if (irq == NO_IRQ) + break; + res->start = irq; + res->flags = IORESOURCE_IRQ; res->name = dev->ofdev.dev.bus_id; - if (macio_resource_quirks(np, res, i)) + if (macio_resource_quirks(np, res, i - 1)) { memset(res, 0, sizeof(struct resource)); - else + continue; + } else j++; } dev->n_interrupts = j; @@ -445,6 +498,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, /* Setup interrupts & resources */ macio_setup_interrupts(dev); macio_setup_resources(dev, parent_res); + macio_add_missing_resources(dev); /* Register with core */ if (of_device_register(&dev->ofdev) != 0) { diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index ff6d9bfdc3d2..f139a74696fe 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -497,8 +497,7 @@ int __init smu_init (void) smu->doorbell = *data; if (smu->doorbell < 0x50) smu->doorbell += 0x50; - if (np->n_intrs > 0) - smu->db_irq = np->intrs[0].line; + smu->db_irq = irq_of_parse_and_map(np, 0); of_node_put(np); @@ -515,8 +514,7 @@ int __init smu_init (void) smu->msg = *data; if (smu->msg < 0x50) smu->msg += 0x50; - if (np->n_intrs > 0) - smu->msg_irq = np->intrs[0].line; + smu->msg_irq = irq_of_parse_and_map(np, 0); of_node_put(np); } while(0); diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c index 6501db50fb83..69d5452fd22f 100644 --- a/drivers/macintosh/via-cuda.c +++ b/drivers/macintosh/via-cuda.c @@ -34,13 +34,6 @@ static volatile unsigned char __iomem *via; static DEFINE_SPINLOCK(cuda_lock); -#ifdef CONFIG_MAC -#define CUDA_IRQ IRQ_MAC_ADB -#define eieio() -#else -#define CUDA_IRQ vias->intrs[0].line -#endif - /* VIA registers - spaced 0x200 bytes apart */ #define RS 0x200 /* skip between registers */ #define B 0 /* B-side data */ @@ -189,11 +182,24 @@ int __init find_via_cuda(void) static int __init via_cuda_start(void) { + unsigned int irq; + if (via == NULL) return -ENODEV; - if (request_irq(CUDA_IRQ, cuda_interrupt, 0, "ADB", cuda_interrupt)) { - printk(KERN_ERR "cuda_init: can't get 
irq %d\n", CUDA_IRQ);
+#ifdef CONFIG_MAC
+	irq = IRQ_MAC_ADB;
+#else /* CONFIG_MAC */
+	irq = irq_of_parse_and_map(vias, 0);
+	if (irq == NO_IRQ) {
+		printk(KERN_ERR "via-cuda: can't map interrupts for %s\n",
+		       vias->full_name);
+		return -ENODEV;
+	}
+#endif /* CONFIG_MAC */
+
+	if (request_irq(irq, cuda_interrupt, 0, "ADB", cuda_interrupt)) {
+		printk(KERN_ERR "via-cuda: can't request irq %d\n", irq);
 		return -EAGAIN;
 	}
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index c1193d34ec9e..06ca80bfd6b9 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -64,10 +64,6 @@
 #include
 #endif

-#ifdef CONFIG_PPC32
-#include
-#endif
-
 #include "via-pmu-event.h"

 /* Some compile options */
@@ -151,7 +147,7 @@ static int pmu_fully_inited = 0;
 static int pmu_has_adb;
 static struct device_node *gpio_node;
 static unsigned char __iomem *gpio_reg = NULL;
-static int gpio_irq = -1;
+static int gpio_irq = NO_IRQ;
 static int gpio_irq_enabled = -1;
 static volatile int pmu_suspended = 0;
 static spinlock_t pmu_lock;
@@ -403,22 +399,21 @@ static int __init pmu_init(void)
  */
 static int __init via_pmu_start(void)
 {
+	unsigned int irq;
+
 	if (vias == NULL)
 		return -ENODEV;

 	batt_req.complete = 1;

-#ifndef CONFIG_PPC_MERGE
-	if (pmu_kind == PMU_KEYLARGO_BASED)
-		openpic_set_irq_priority(vias->intrs[0].line,
-					 OPENPIC_PRIORITY_DEFAULT + 1);
-#endif
-
-	if (request_irq(vias->intrs[0].line, via_pmu_interrupt, 0, "VIA-PMU",
-			(void *)0)) {
-		printk(KERN_ERR "VIA-PMU: can't get irq %d\n",
-		       vias->intrs[0].line);
-		return -EAGAIN;
+	irq = irq_of_parse_and_map(vias, 0);
+	if (irq == NO_IRQ) {
+		printk(KERN_ERR "via-pmu: can't map interrupt\n");
+		return -ENODEV;
+	}
+	if (request_irq(irq, via_pmu_interrupt, 0, "VIA-PMU", (void *)0)) {
+		printk(KERN_ERR "via-pmu: can't request irq %d\n", irq);
+		return -ENODEV;
 	}

 	if (pmu_kind == PMU_KEYLARGO_BASED) {
@@ -426,10 +421,10 @@ static int __init via_pmu_start(void)
 		if (gpio_node == NULL)
 			gpio_node = of_find_node_by_name(NULL,
 							 "pmu-interrupt");
-		if (gpio_node && gpio_node->n_intrs > 0)
-			gpio_irq = gpio_node->intrs[0].line;
+		if (gpio_node)
+			gpio_irq = irq_of_parse_and_map(gpio_node, 0);

-		if (gpio_irq != -1) {
+		if (gpio_irq != NO_IRQ) {
 			if (request_irq(gpio_irq, gpio1_interrupt, 0,
 					"GPIO1 ADB", (void *)0))
 				printk(KERN_ERR "pmu: can't get irq %d"
diff --git a/drivers/net/mace.c b/drivers/net/mace.c
index f2c0bf89f0c7..29e4b5aa6ead 100644
--- a/drivers/net/mace.c
+++ b/drivers/net/mace.c
@@ -242,12 +242,12 @@ static int __devinit mace_probe(struct macio_dev *mdev, const struct of_device_i
 	}
 	rc = request_irq(mp->tx_dma_intr, mace_txdma_intr, 0, "MACE-txdma", dev);
 	if (rc) {
-		printk(KERN_ERR "MACE: can't get irq %d\n", mace->intrs[1].line);
+		printk(KERN_ERR "MACE: can't get irq %d\n", mp->tx_dma_intr);
 		goto err_free_irq;
 	}
 	rc = request_irq(mp->rx_dma_intr, mace_rxdma_intr, 0, "MACE-rxdma", dev);
 	if (rc) {
-		printk(KERN_ERR "MACE: can't get irq %d\n", mace->intrs[2].line);
+		printk(KERN_ERR "MACE: can't get irq %d\n", mp->rx_dma_intr);
 		goto err_free_tx_irq;
 	}
diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index 459c0231aef3..bfd2a22759eb 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -1443,8 +1443,8 @@ static int __init pmz_init_port(struct uart_pmac_port *uap)
 		uap->flags &= ~PMACZILOG_FLAG_HAS_DMA;
 		goto no_dma;
 	}
-	uap->tx_dma_irq = np->intrs[1].line;
-	uap->rx_dma_irq = np->intrs[2].line;
+	uap->tx_dma_irq = irq_of_parse_and_map(np, 1);
+	uap->rx_dma_irq = irq_of_parse_and_map(np, 2);
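+
+	/* Illustrative hardening, not in the original patch: both helpers
+	 * return NO_IRQ on failure, so a more defensive variant of this
+	 * hunk could drop the DMA claim when either mapping fails:
+	 *
+	 *	if (uap->tx_dma_irq == NO_IRQ || uap->rx_dma_irq == NO_IRQ)
+	 *		uap->flags &= ~PMACZILOG_FLAG_HAS_DMA;
+	 */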
} no_dma: @@ -1491,7 +1491,7 @@ no_dma: * Init remaining bits of "port" structure */ uap->port.iotype = UPIO_MEM; - uap->port.irq = np->intrs[0].line; + uap->port.irq = irq_of_parse_and_map(np, 0); uap->port.uartclk = ZS_CLOCK; uap->port.fifosize = 1; uap->port.ops = &pmz_pops; diff --git a/include/asm-powerpc/i8259.h b/include/asm-powerpc/i8259.h index ff31cb90325f..c80e113052cd 100644 --- a/include/asm-powerpc/i8259.h +++ b/include/asm-powerpc/i8259.h @@ -4,8 +4,13 @@ #include +#ifdef CONFIG_PPC_MERGE +extern void i8259_init(struct device_node *node, unsigned long intack_addr); +extern unsigned int i8259_irq(struct pt_regs *regs); +#else extern void i8259_init(unsigned long intack_addr, int offset); extern int i8259_irq(struct pt_regs *regs); +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_I8259_H */ diff --git a/include/asm-powerpc/irq.h b/include/asm-powerpc/irq.h index 13fa2ef38dc7..e05754752028 100644 --- a/include/asm-powerpc/irq.h +++ b/include/asm-powerpc/irq.h @@ -9,26 +9,14 @@ * 2 of the License, or (at your option) any later version. */ +#include #include +#include +#include #include #include -/* this number is used when no interrupt has been assigned */ -#define NO_IRQ (-1) - -/* - * These constants are used for passing information about interrupt - * signal polarity and level/edge sensing to the low-level PIC chip - * drivers. - */ -#define IRQ_SENSE_MASK 0x1 -#define IRQ_SENSE_LEVEL 0x1 /* interrupt on active level */ -#define IRQ_SENSE_EDGE 0x0 /* interrupt triggered by edge */ - -#define IRQ_POLARITY_MASK 0x2 -#define IRQ_POLARITY_POSITIVE 0x2 /* high level or low->high edge */ -#define IRQ_POLARITY_NEGATIVE 0x0 /* low level or high->low edge */ #define get_irq_desc(irq) (&irq_desc[(irq)]) @@ -36,50 +24,325 @@ #define for_each_irq(i) \ for ((i) = 0; (i) < NR_IRQS; ++(i)) -#ifdef CONFIG_PPC64 +extern atomic_t ppc_n_lost_interrupts; -/* - * Maximum number of interrupt sources that we can handle. +#ifdef CONFIG_PPC_MERGE + +/* This number is used when no interrupt has been assigned */ +#define NO_IRQ (0) + +/* This is a special irq number to return from get_irq() to tell that + * no interrupt happened _and_ ignore it (don't count it as bad). Some + * platforms like iSeries rely on that. */ +#define NO_IRQ_IGNORE ((unsigned int)-1) + +/* Total number of virq in the platform (make it a CONFIG_* option ? */ #define NR_IRQS 512 -/* Interrupt numbers are virtual in case they are sparsely - * distributed by the hardware. +/* Number of irqs reserved for the legacy controller */ +#define NUM_ISA_INTERRUPTS 16 + +/* This type is the placeholder for a hardware interrupt number. It has to + * be big enough to enclose whatever representation is used by a given + * platform. + */ +typedef unsigned long irq_hw_number_t; + +/* Interrupt controller "host" data structure. This could be defined as a + * irq domain controller. That is, it handles the mapping between hardware + * and virtual interrupt numbers for a given interrupt domain. The host + * structure is generally created by the PIC code for a given PIC instance + * (though a host can cover more than one PIC if they have a flat number + * model). It's the host callbacks that are responsible for setting the + * irq_chip on a given irq_desc after it's been mapped. + * + * The host code and data structures are fairly agnostic to the fact that + * we use an open firmware device-tree. 
We do have references to struct
+ * device_node in two places: in irq_find_host() to find the host matching
+ * a given interrupt controller node, and of course as an argument to its
+ * counterpart host->ops->match() callback. However, those are treated as
+ * generic pointers by the core and the fact that it's actually a device-node
+ * pointer is purely a convention between callers and implementation. This
+ * code could thus be used on other architectures by replacing those two
+ * by some sort of arch-specific void * "token" used to identify interrupt
+ * controllers.
+ */
+struct irq_host;
+struct radix_tree_root;
+
+/* Functions below are provided by the host and called whenever a new mapping
+ * is created or an old mapping is disposed. The host can then proceed with
+ * whatever internal data-structure management is required. It also needs
+ * to setup the irq_desc when returning from map().
+ */
+struct irq_host_ops {
+	/* Match an interrupt controller device node to a host, returns
+	 * 1 on a match
+	 */
+	int (*match)(struct irq_host *h, struct device_node *node);
+
+	/* Create or update a mapping between a virtual irq number and a hw
+	 * irq number. This can be called several times for the same mapping
+	 * but with different flags, though unmap shall always be called
+	 * before the virq->hw mapping is changed.
+	 */
+	int (*map)(struct irq_host *h, unsigned int virq,
+		   irq_hw_number_t hw, unsigned int flags);
+
+	/* Dispose of such a mapping */
+	void (*unmap)(struct irq_host *h, unsigned int virq);
+
+	/* Translate device-tree interrupt specifier from raw format coming
+	 * from the firmware to an irq_hw_number_t (interrupt line number) and
+	 * trigger flags that can be passed to irq_create_mapping().
+	 * If no translation is provided, raw format is assumed to be one cell
+	 * for interrupt line and default sense.
+	 */
+	int (*xlate)(struct irq_host *h, struct device_node *ctrler,
+		     u32 *intspec, unsigned int intsize,
+		     irq_hw_number_t *out_hwirq, unsigned int *out_flags);
+};
+
+struct irq_host {
+	struct list_head	link;
+
+	/* type of reverse mapping technique */
+	unsigned int		revmap_type;
+#define IRQ_HOST_MAP_LEGACY	0 /* legacy 8259, gets irqs 1..15 */
+#define IRQ_HOST_MAP_NOMAP	1 /* no fast reverse mapping */
+#define IRQ_HOST_MAP_LINEAR	2 /* linear map of interrupts */
+#define IRQ_HOST_MAP_TREE	3 /* radix tree */
+	union {
+		struct {
+			unsigned int size;
+			unsigned int *revmap;
+		} linear;
+		struct radix_tree_root tree;
+	} revmap_data;
+	struct irq_host_ops	*ops;
+	void			*host_data;
+	irq_hw_number_t		inval_irq;
+};
+
+/* The main irq map itself is an array of NR_IRQ entries containing the
+ * associated host and irq number. An entry with a host of NULL is free.
+ * An entry can be allocated if it's free, the allocator always then sets
+ * hwirq first to the host's invalid irq number and then fills ops.
+ */
+struct irq_map_entry {
+	irq_hw_number_t	hwirq;
+	struct irq_host	*host;
+};
+
+extern struct irq_map_entry irq_map[NR_IRQS];
+
+
+/***
+ * irq_alloc_host - Allocate a new irq_host data structure
+ * @node: device-tree node of the interrupt controller
+ * @revmap_type: type of reverse mapping to use
+ * @revmap_arg: for IRQ_HOST_MAP_LINEAR only: size of the map
+ * @ops: map/unmap host callbacks
+ * @inval_irq: provide a hw number in that host space that is always invalid
+ *
+ * Allocates and initializes an irq_host structure. 
Note that in the case of + * IRQ_HOST_MAP_LEGACY, the map() callback will be called before this returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). For a IRQ_HOST_MAP_LINEAR, the map is allocated by + * this call as well. For a IRQ_HOST_MAP_TREE, the radix tree will be allocated + * later during boot automatically (the reverse mapping will use the slow path + * until that happens). + */ +extern struct irq_host *irq_alloc_host(unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_host_ops *ops, + irq_hw_number_t inval_irq); + + +/*** + * irq_find_host - Locates a host for a given device node + * @node: device-tree node of the interrupt controller + */ +extern struct irq_host *irq_find_host(struct device_node *node); + + +/*** + * irq_set_default_host - Set a "default" host + * @host: default host pointer + * + * For convenience, it's possible to set a "default" host that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. + */ +extern void irq_set_default_host(struct irq_host *host); + + +/*** + * irq_set_virq_count - Set the maximum number of virt irqs + * @count: number of linux virtual irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +extern void irq_set_virq_count(unsigned int count); + + +/*** + * irq_create_mapping - Map a hardware interrupt into linux virq space + * @host: host owning this hardware interrupt or NULL for default host + * @hwirq: hardware irq number in that host space + * @flags: flags passed to the controller. contains the trigger type among + * others. Use IRQ_TYPE_* defined in include/linux/irq.h + * + * Only one mapping per hardware interrupt is permitted. Returns a linux + * virq number. The flags can be used to provide sense information to the + * controller (typically extracted from the device-tree). If no information + * is passed, the controller defaults will apply (for example, xics can only + * do edge so flags are irrelevant for some pseries specific irqs). + * + * The device-tree generally contains the trigger info in an encoding that is + * specific to a given type of controller. In that case, you can directly use + * host->ops->trigger_xlate() to translate that. + * + * It is recommended that new PICs that don't have existing OF bindings chose + * to use a representation of triggers identical to linux. + */ +extern unsigned int irq_create_mapping(struct irq_host *host, + irq_hw_number_t hwirq, + unsigned int flags); + + +/*** + * irq_dispose_mapping - Unmap an interrupt + * @virq: linux virq number of the interrupt to unmap */ -extern unsigned int virt_irq_to_real_map[NR_IRQS]; +extern void irq_dispose_mapping(unsigned int virq); -/* The maximum virtual IRQ number that we support. This - * can be set by the platform and will be reduced by the - * value of __irq_offset_value. It defaults to and is - * capped by (NR_IRQS - 1). +/*** + * irq_find_mapping - Find a linux virq from an hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. 
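+ *
+ * As a usage sketch, the xics teardown path in this same patch uses it to
+ * recover the virtual number of its IPI:
+ *
+ *	ipi = irq_find_mapping(xics_host, XICS_IPI);
+ *	if (ipi == XICS_IRQ_SPURIOUS)
+ *		return;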
*/ -extern unsigned int virt_irq_max; +extern unsigned int irq_find_mapping(struct irq_host *host, + irq_hw_number_t hwirq); -/* Create a mapping for a real_irq if it doesn't already exist. - * Return the virtual irq as a convenience. + +/*** + * irq_radix_revmap - Find a linux virq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses radix tree + * revmaps + */ +extern unsigned int irq_radix_revmap(struct irq_host *host, + irq_hw_number_t hwirq); + +/*** + * irq_linear_revmap - Find a linux virq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ + +extern unsigned int irq_linear_revmap(struct irq_host *host, + irq_hw_number_t hwirq); + + + +/*** + * irq_alloc_virt - Allocate virtual irq numbers + * @host: host owning these new virtual irqs + * @count: number of consecutive numbers to allocate + * @hint: pass a hint number, the allocator will try to use a 1:1 mapping + * + * This is a low level function that is used internally by irq_create_mapping() + * and that can be used by some irq controllers implementations for things + * like allocating ranges of numbers for MSIs. The revmaps are left untouched. */ -int virt_irq_create_mapping(unsigned int real_irq); -void virt_irq_init(void); +extern unsigned int irq_alloc_virt(struct irq_host *host, + unsigned int count, + unsigned int hint); + +/*** + * irq_free_virt - Free virtual irq numbers + * @virq: virtual irq number of the first interrupt to free + * @count: number of interrupts to free + * + * This function is the opposite of irq_alloc_virt. It will not clear reverse + * maps, this should be done previously by unmap'ing the interrupt. In fact, + * all interrupts covered by the range being freed should have been unmapped + * prior to calling this. 
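+ *
+ * A hypothetical MSI-style pairing that respects those rules (names and
+ * counts invented for illustration):
+ *
+ *	base = irq_alloc_virt(host, 4, 0);
+ *	... map and use the four virqs ...
+ *	for (i = 0; i < 4; i++)
+ *		irq_dispose_mapping(base + i);
+ *	irq_free_virt(base, 4);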
+ */
+extern void irq_free_virt(unsigned int virq, unsigned int count);
+
+
+/* -- OF helpers -- */
+
+/* irq_create_of_mapping - Map a hardware interrupt into linux virq space
+ * @controller: Device node of the interrupt controller
+ * @intspec: Interrupt specifier from the device-tree
+ * @intsize: Size of the interrupt specifier from the device-tree
+ *
+ * This function is identical to irq_create_mapping except that it takes
+ * as input information straight from the device-tree (typically the results
+ * of the of_irq_map_*() functions)
+ */
+extern unsigned int irq_create_of_mapping(struct device_node *controller,
+					  u32 *intspec, unsigned int intsize);
+
+
+/* irq_of_parse_and_map - Parse and Map an interrupt into linux virq space
+ * @device: Device node of the device whose interrupt is to be mapped
+ * @index: Index of the interrupt to map
+ *
+ * This function is a wrapper that chains of_irq_map_one() and
+ * irq_create_of_mapping() to make things easier for callers
+ */
+extern unsigned int irq_of_parse_and_map(struct device_node *dev, int index);
+
+/* -- End OF helpers -- */

-static inline unsigned int virt_irq_to_real(unsigned int virt_irq)
+/***
+ * irq_early_init - Init irq remapping subsystem
+ */
+extern void irq_early_init(void);
+
+static __inline__ int irq_canonicalize(int irq)
 {
-	return virt_irq_to_real_map[virt_irq];
+	return irq;
 }
-extern unsigned int real_irq_to_virt_slowpath(unsigned int real_irq);
+
+#else /* CONFIG_PPC_MERGE */
+
+/* This number is used when no interrupt has been assigned */
+#define NO_IRQ			(-1)
+#define NO_IRQ_IGNORE		(-2)
+
 /*
- * List of interrupt controllers.
+ * These constants are used for passing information about interrupt
+ * signal polarity and level/edge sensing to the low-level PIC chip
+ * drivers. 
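+ * For example, a classic active-low, level-triggered ISA line would be
+ * described as (IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE), the old-world
+ * spelling of what the CONFIG_PPC_MERGE code above calls
+ * IRQ_TYPE_LEVEL_LOW.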
*/ -#define IC_INVALID 0 -#define IC_OPEN_PIC 1 -#define IC_PPC_XIC 2 -#define IC_CELL_PIC 3 -#define IC_ISERIES 4 +#define IRQ_SENSE_MASK 0x1 +#define IRQ_SENSE_LEVEL 0x1 /* interrupt on active level */ +#define IRQ_SENSE_EDGE 0x0 /* interrupt triggered by edge */ -extern u64 ppc64_interrupt_controller; +#define IRQ_POLARITY_MASK 0x2 +#define IRQ_POLARITY_POSITIVE 0x2 /* high level or low->high edge */ +#define IRQ_POLARITY_NEGATIVE 0x0 /* low level or high->low edge */ -#else /* 32-bit */ #if defined(CONFIG_40x) #include @@ -512,19 +775,11 @@ extern u64 ppc64_interrupt_controller; #endif /* CONFIG_8260 */ -#endif +#endif /* Whatever way too big #ifdef */ -#ifndef CONFIG_PPC_MERGE #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) /* pedantic: these are long because they are used with set_bit --RR */ extern unsigned long ppc_cached_irq_mask[NR_MASK_WORDS]; -#endif - -extern atomic_t ppc_n_lost_interrupts; - -#define virt_irq_create_mapping(x) (x) - -#endif /* * Because many systems have two overlapping names spaces for @@ -563,6 +818,7 @@ static __inline__ int irq_canonicalize(int irq) irq = 9; return irq; } +#endif /* CONFIG_PPC_MERGE */ extern int distribute_irqs; diff --git a/include/asm-powerpc/machdep.h b/include/asm-powerpc/machdep.h index eba133d149a7..c17c13742401 100644 --- a/include/asm-powerpc/machdep.h +++ b/include/asm-powerpc/machdep.h @@ -97,7 +97,7 @@ struct machdep_calls { void (*show_percpuinfo)(struct seq_file *m, int i); void (*init_IRQ)(void); - int (*get_irq)(struct pt_regs *); + unsigned int (*get_irq)(struct pt_regs *); #ifdef CONFIG_KEXEC void (*kexec_cpu_down)(int crash_shutdown, int secondary); #endif diff --git a/include/asm-powerpc/mpic.h b/include/asm-powerpc/mpic.h index a2277cb77ddc..eb241c99c457 100644 --- a/include/asm-powerpc/mpic.h +++ b/include/asm-powerpc/mpic.h @@ -129,6 +129,12 @@ struct mpic_irq_fixup /* The instance data of a given MPIC */ struct mpic { + /* The device node of the interrupt controller */ + struct device_node *of_node; + + /* The remapper for this MPIC */ + struct irq_host *irqhost; + /* The "linux" controller struct */ struct irq_chip hc_irq; #ifdef CONFIG_MPIC_BROKEN_U3 @@ -144,16 +150,12 @@ struct mpic unsigned int isu_size; unsigned int isu_shift; unsigned int isu_mask; - /* Offset of irq vector numbers */ - unsigned int irq_offset; unsigned int irq_count; - /* Offset of ipi vector numbers */ - unsigned int ipi_offset; /* Number of sources */ unsigned int num_sources; /* Number of CPUs */ unsigned int num_cpus; - /* senses array */ + /* default senses array */ unsigned char *senses; unsigned int senses_count; @@ -209,14 +211,11 @@ struct mpic * The values in the array start at the first source of the MPIC, * that is senses[0] correspond to linux irq "irq_offset". 
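 * A hypothetical default-senses table under the IRQ_TYPE_* scheme this
 * patch switches to (array name and contents invented for illustration):
 *
 *	static u8 board_senses[2] = {
 *		IRQ_TYPE_EDGE_RISING,
 *		IRQ_TYPE_LEVEL_LOW,
 *	};
 *	mpic_set_default_senses(mpic, board_senses, 2);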
*/ -extern struct mpic *mpic_alloc(unsigned long phys_addr, +extern struct mpic *mpic_alloc(struct device_node *node, + unsigned long phys_addr, unsigned int flags, unsigned int isu_size, - unsigned int irq_offset, unsigned int irq_count, - unsigned int ipi_offset, - unsigned char *senses, - unsigned int senses_num, const char *name); /* Assign ISUs, to call before mpic_init() @@ -228,6 +227,22 @@ extern struct mpic *mpic_alloc(unsigned long phys_addr, extern void mpic_assign_isu(struct mpic *mpic, unsigned int isu_num, unsigned long phys_addr); +/* Set default sense codes + * + * @mpic: controller + * @senses: array of sense codes + * @count: size of above array + * + * Optionally provide an array (indexed on hardware interrupt numbers + * for this MPIC) of default sense codes for the chip. Those are linux + * sense codes IRQ_TYPE_* + * + * The driver gets ownership of the pointer, don't dispose of it or + * anything like that. __init only. + */ +extern void mpic_set_default_senses(struct mpic *mpic, u8 *senses, int count); + + /* Initialize the controller. After this has been called, none of the above * should be called again for this mpic */ @@ -269,9 +284,9 @@ extern void mpic_send_ipi(unsigned int ipi_no, unsigned int cpu_mask); void smp_mpic_message_pass(int target, int msg); /* Fetch interrupt from a given mpic */ -extern int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs); +extern unsigned int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs); /* This one gets to the primary mpic */ -extern int mpic_get_irq(struct pt_regs *regs); +extern unsigned int mpic_get_irq(struct pt_regs *regs); /* Set the EPIC clock ratio */ void mpic_set_clk_ratio(struct mpic *mpic, u32 clock_ratio); @@ -279,8 +294,5 @@ void mpic_set_clk_ratio(struct mpic *mpic, u32 clock_ratio); /* Enable/Disable EPIC serial interrupt mode */ void mpic_set_serial_int(struct mpic *mpic, int enable); -/* global mpic for pSeries */ -extern struct mpic *pSeries_mpic; - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MPIC_H */ diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h index 48bef401bc19..b095a285c84b 100644 --- a/include/asm-powerpc/prom.h +++ b/include/asm-powerpc/prom.h @@ -64,11 +64,6 @@ struct boot_param_header typedef u32 phandle; typedef u32 ihandle; -struct interrupt_info { - int line; - int sense; /* +ve/-ve logic, edge or level, etc. 
*/ -}; - struct property { char *name; int length; @@ -81,8 +76,6 @@ struct device_node { char *type; phandle node; phandle linux_phandle; - int n_intrs; - struct interrupt_info *intrs; char *full_name; struct property *properties; diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h index 9609d3ee8798..c02d105d8294 100644 --- a/include/asm-powerpc/spu.h +++ b/include/asm-powerpc/spu.h @@ -117,6 +117,7 @@ struct spu { struct list_head sched_list; int number; int nid; + unsigned int irqs[3]; u32 isrc; u32 node; u64 flags; diff --git a/sound/aoa/core/snd-aoa-gpio-feature.c b/sound/aoa/core/snd-aoa-gpio-feature.c index bab97547a052..7ae0c0bdfad8 100644 --- a/sound/aoa/core/snd-aoa-gpio-feature.c +++ b/sound/aoa/core/snd-aoa-gpio-feature.c @@ -112,12 +112,7 @@ static struct device_node *get_gpio(char *name, static void get_irq(struct device_node * np, int *irqptr) { - *irqptr = -1; - if (!np) - return; - if (np->n_intrs != 1) - return; - *irqptr = np->intrs[0].line; + *irqptr = irq_of_parse_and_map(np, 0); } /* 0x4 is outenable, 0x1 is out, thus 4 or 5 */ diff --git a/sound/aoa/soundbus/i2sbus/i2sbus-core.c b/sound/aoa/soundbus/i2sbus/i2sbus-core.c index f268dacdaa00..01c0724335a3 100644 --- a/sound/aoa/soundbus/i2sbus/i2sbus-core.c +++ b/sound/aoa/soundbus/i2sbus/i2sbus-core.c @@ -129,7 +129,7 @@ static int i2sbus_add_dev(struct macio_dev *macio, if (strncmp(np->name, "i2s-", 4)) return 0; - if (np->n_intrs != 3) + if (macio_irq_count(macio) != 3) return 0; dev = kzalloc(sizeof(struct i2sbus_dev), GFP_KERNEL); @@ -183,9 +183,10 @@ static int i2sbus_add_dev(struct macio_dev *macio, snprintf(dev->rnames[i], sizeof(dev->rnames[i]), rnames[i], np->name); } for (i=0;i<3;i++) { - if (request_irq(np->intrs[i].line, ints[i], 0, dev->rnames[i], dev)) + if (request_irq(macio_irq(macio, i), ints[i], 0, + dev->rnames[i], dev)) goto err; - dev->interrupts[i] = np->intrs[i].line; + dev->interrupts[i] = macio_irq(macio, i); } for (i=0;i<3;i++) { diff --git a/sound/oss/dmasound/dmasound_awacs.c b/sound/oss/dmasound/dmasound_awacs.c index de454ca39226..4359903f4376 100644 --- a/sound/oss/dmasound/dmasound_awacs.c +++ b/sound/oss/dmasound/dmasound_awacs.c @@ -374,10 +374,7 @@ setup_audio_gpio(const char *name, const char* compatible, int *gpio_addr, int* *gpio_pol = *pp; else *gpio_pol = 1; - if (np->n_intrs > 0) - return np->intrs[0].line; - - return 0; + return irq_of_parse_and_map(np, 0); } static inline void @@ -2864,14 +2861,13 @@ printk("dmasound_pmac: couldn't find a Codec we can handle\n"); * other info if necessary (early AWACS we want to read chip ids) */ - if (of_get_address(io, 2, NULL, NULL) == NULL || io->n_intrs < 3) { + if (of_get_address(io, 2, NULL, NULL) == NULL) { /* OK - maybe we need to use the 'awacs' node (on earlier * machines). 
*/ if (awacs_node) { io = awacs_node ; - if (of_get_address(io, 2, NULL, NULL) == NULL || - io->n_intrs < 3) { + if (of_get_address(io, 2, NULL, NULL) == NULL) { printk("dmasound_pmac: can't use %s\n", io->full_name); return -ENODEV; @@ -2940,9 +2936,9 @@ printk("dmasound_pmac: couldn't find a Codec we can handle\n"); if (awacs_revision == AWACS_SCREAMER && awacs) awacs_recalibrate(); - awacs_irq = io->intrs[0].line; - awacs_tx_irq = io->intrs[1].line; - awacs_rx_irq = io->intrs[2].line; + awacs_irq = irq_of_parse_and_map(io, 0); + awacs_tx_irq = irq_of_parse_and_map(io, 1); + awacs_rx_irq = irq_of_parse_and_map(io, 2); /* Hack for legacy crap that will be killed someday */ awacs_node = io; diff --git a/sound/ppc/pmac.c b/sound/ppc/pmac.c index 90db9a1d1e0a..641430631505 100644 --- a/sound/ppc/pmac.c +++ b/sound/ppc/pmac.c @@ -1120,6 +1120,7 @@ int __init snd_pmac_new(struct snd_card *card, struct snd_pmac **chip_return) struct snd_pmac *chip; struct device_node *np; int i, err; + unsigned int irq; unsigned long ctrl_addr, txdma_addr, rxdma_addr; static struct snd_device_ops ops = { .dev_free = snd_pmac_dev_free, @@ -1153,10 +1154,6 @@ int __init snd_pmac_new(struct snd_card *card, struct snd_pmac **chip_return) if (chip->is_k2) { static char *rnames[] = { "Sound Control", "Sound DMA" }; - if (np->n_intrs < 3) { - err = -ENODEV; - goto __error; - } for (i = 0; i < 2; i ++) { if (of_address_to_resource(np->parent, i, &chip->rsrc[i])) { @@ -1185,10 +1182,6 @@ int __init snd_pmac_new(struct snd_card *card, struct snd_pmac **chip_return) } else { static char *rnames[] = { "Sound Control", "Sound Tx DMA", "Sound Rx DMA" }; - if (np->n_intrs < 3) { - err = -ENODEV; - goto __error; - } for (i = 0; i < 3; i ++) { if (of_address_to_resource(np, i, &chip->rsrc[i])) { @@ -1220,28 +1213,30 @@ int __init snd_pmac_new(struct snd_card *card, struct snd_pmac **chip_return) chip->playback.dma = ioremap(txdma_addr, 0x100); chip->capture.dma = ioremap(rxdma_addr, 0x100); if (chip->model <= PMAC_BURGUNDY) { - if (request_irq(np->intrs[0].line, snd_pmac_ctrl_intr, 0, + irq = irq_of_parse_and_map(np, 0); + if (request_irq(irq, snd_pmac_ctrl_intr, 0, "PMac", (void*)chip)) { - snd_printk(KERN_ERR "pmac: unable to grab IRQ %d\n", np->intrs[0].line); + snd_printk(KERN_ERR "pmac: unable to grab IRQ %d\n", + irq); err = -EBUSY; goto __error; } - chip->irq = np->intrs[0].line; + chip->irq = irq; } - if (request_irq(np->intrs[1].line, snd_pmac_tx_intr, 0, - "PMac Output", (void*)chip)) { - snd_printk(KERN_ERR "pmac: unable to grab IRQ %d\n", np->intrs[1].line); + irq = irq_of_parse_and_map(np, 1); + if (request_irq(irq, snd_pmac_tx_intr, 0, "PMac Output", (void*)chip)){ + snd_printk(KERN_ERR "pmac: unable to grab IRQ %d\n", irq); err = -EBUSY; goto __error; } - chip->tx_irq = np->intrs[1].line; - if (request_irq(np->intrs[2].line, snd_pmac_rx_intr, 0, - "PMac Input", (void*)chip)) { - snd_printk(KERN_ERR "pmac: unable to grab IRQ %d\n", np->intrs[2].line); + chip->tx_irq = irq; + irq = irq_of_parse_and_map(np, 2); + if (request_irq(irq, snd_pmac_rx_intr, 0, "PMac Input", (void*)chip)) { + snd_printk(KERN_ERR "pmac: unable to grab IRQ %d\n", irq); err = -EBUSY; goto __error; } - chip->rx_irq = np->intrs[2].line; + chip->rx_irq = irq; snd_pmac_sound_feature(chip, 1); diff --git a/sound/ppc/tumbler.c b/sound/ppc/tumbler.c index 70e4ebc70260..692c61177678 100644 --- a/sound/ppc/tumbler.c +++ b/sound/ppc/tumbler.c @@ -1121,7 +1121,7 @@ static long tumbler_find_device(const char *device, const char *platform, DBG("(I) GPIO 
device %s found, offset: %x, active state: %d !\n", device, gp->addr, gp->active_state); - return (node->n_intrs > 0) ? node->intrs[0].line : 0; + return irq_of_parse_and_map(node, 0); } /* reset audio */ @@ -1264,16 +1264,16 @@ static int __init tumbler_init(struct snd_pmac *chip) &mix->line_mute, 1); irq = tumbler_find_device("headphone-detect", NULL, &mix->hp_detect, 0); - if (irq < 0) + if (irq <= NO_IRQ) irq = tumbler_find_device("headphone-detect", NULL, &mix->hp_detect, 1); - if (irq < 0) + if (irq <= NO_IRQ) irq = tumbler_find_device("keywest-gpio15", NULL, &mix->hp_detect, 1); mix->headphone_irq = irq; irq = tumbler_find_device("line-output-detect", NULL, &mix->line_detect, 0); - if (irq < 0) + if (irq <= NO_IRQ) irq = tumbler_find_device("line-output-detect", NULL, &mix->line_detect, 1); mix->lineout_irq = irq; -- cgit v1.2.3 From 63104eec234bdecb55fd9c15467ae00d0a3f42ac Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 3 Jul 2006 23:30:54 +0200 Subject: kbuild: introduce utsrelease.h include/linux/version.h contained both the actual KERNEL version and UTS_RELEASE, which contains a subset of the git SHA1 when the kernel is compiled as part of a git repository. This had the unfortunate side-effect that all files including version.h would be recompiled whenever a git change altered the SHA1. Split it out so we keep the independent parts in separate files. Also update the checkversion.pl script to no longer check for UTS_RELEASE. Signed-off-by: Sam Ravnborg --- Makefile | 30 +++++++++++++++----------- arch/alpha/boot/bootp.c | 2 +- arch/alpha/boot/bootpz.c | 2 +- arch/alpha/boot/main.c | 2 +- arch/frv/kernel/setup.c | 2 +- arch/i386/boot/setup.S | 2 +- arch/powerpc/platforms/chrp/setup.c | 2 +- arch/powerpc/platforms/powermac/bootx_init.c | 2 +- arch/ppc/syslib/btext.c | 2 +- arch/x86_64/boot/setup.S | 2 +- drivers/net/wireless/bcm43xx/bcm43xx_ethtool.c | 2 +- include/linux/vermagic.h | 2 +- init/version.c | 1 + scripts/checkversion.pl | 7 +++--- 14 files changed, 33 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/Makefile b/Makefile index 1cc793ff42d4..b36aeb617bd2 100644 --- a/Makefile +++ b/Makefile @@ -812,8 +812,8 @@ endif # prepare2 creates a makefile if using a separate output directory prepare2: prepare3 outputmakefile -prepare1: prepare2 include/linux/version.h include/asm \ - include/config/auto.conf +prepare1: prepare2 include/linux/version.h include/linux/utsrelease.h \ + include/asm include/config/auto.conf ifneq ($(KBUILD_MODULES),) $(Q)mkdir -p $(MODVERDIR) $(Q)rm -f $(MODVERDIR)/* @@ -848,21 +848,26 @@ include/asm: # needs to be updated, so this check is forced on all builds uts_len := 64 +define filechk_utsrelease.h + if [ `echo -n "$(KERNELRELEASE)" | wc -c ` -gt $(uts_len) ]; then \ + echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ + exit 1; \ + fi; \ + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";) +endef define filechk_version.h - if [ `echo -n "$(KERNELRELEASE)" | wc -c ` -gt $(uts_len) ]; then \ - echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ - exit 1; \ - fi; \ - (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; \ - echo \#define LINUX_VERSION_CODE `expr $(VERSION) \\* 65536 + $(PATCHLEVEL) \\* 256 + $(SUBLEVEL)`; \ - echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))'; \ - ) + (echo \#define LINUX_VERSION_CODE $(shell \ + expr $(VERSION) \* 65536 + $(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \ + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';) endef
-include/linux/version.h: $(srctree)/Makefile include/config/kernel.release FORCE +include/linux/version.h: $(srctree)/Makefile FORCE $(call filechk,version.h) +include/linux/utsrelease.h: include/config/kernel.release FORCE + $(call filechk,utsrelease.h) + # --------------------------------------------------------------------------- PHONY += depend dep @@ -955,7 +960,8 @@ CLEAN_FILES += vmlinux System.map \ # Directories & files removed with 'make mrproper' MRPROPER_DIRS += include/config include2 MRPROPER_FILES += .config .config.old include/asm .version .old_version \ - include/linux/autoconf.h include/linux/version.h \ + include/linux/autoconf.h include/linux/version.h \ + include/linux/utsrelease.h \ Module.symvers tags TAGS cscope* # clean - Delete most, but leave enough to build external modules diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index ec53c28e33de..3af21c789339 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -9,7 +9,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index a6657f2cf9bd..4307bde80a35 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -11,7 +11,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 78c9b0b6eea7..90ed55b662a8 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c index 5db3d4eff909..af08ccd4ed6e 100644 --- a/arch/frv/kernel/setup.c +++ b/arch/frv/kernel/setup.c @@ -10,7 +10,7 @@ * 2 of the License, or (at your option) any later version. */ -#include +#include #include #include #include diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index 0a5a3be6d69c..d2b684cd620a 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -47,7 +47,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index 1f1771b212b4..9df9f2079e9b 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c index cb257aeb91f6..24f09e2a5775 100644 --- a/arch/powerpc/platforms/powermac/bootx_init.c +++ b/arch/powerpc/platforms/powermac/bootx_init.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ppc/syslib/btext.c b/arch/ppc/syslib/btext.c index 51ab6e90fe25..d11667046f21 100644 --- a/arch/ppc/syslib/btext.c +++ b/arch/ppc/syslib/btext.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 7de8b8fd1685..a50b631f4d2b 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -46,7 +46,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/bcm43xx/bcm43xx_ethtool.c b/drivers/net/wireless/bcm43xx/bcm43xx_ethtool.c index b3ffcf501311..e386dcc32e8c 100644 --- a/drivers/net/wireless/bcm43xx/bcm43xx_ethtool.c +++ b/drivers/net/wireless/bcm43xx/bcm43xx_ethtool.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include static void bcm43xx_get_drvinfo(struct 
net_device *dev, struct ethtool_drvinfo *info) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index dc7c621e4647..46919f9f5eb3 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -1,4 +1,4 @@ -#include +#include #include /* Simply sanity version stamp for modules. */ diff --git a/init/version.c b/init/version.c index 3ddc3ceec2fe..e290802c6bd2 100644 --- a/init/version.c +++ b/init/version.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #define version(a) Version_ ## a diff --git a/scripts/checkversion.pl b/scripts/checkversion.pl index 9f84e562318d..ec7d21161bdc 100755 --- a/scripts/checkversion.pl +++ b/scripts/checkversion.pl @@ -1,7 +1,7 @@ #! /usr/bin/perl # -# checkversion find uses of LINUX_VERSION_CODE, KERNEL_VERSION, or -# UTS_RELEASE without including , or cases of +# checkversion find uses of LINUX_VERSION_CODE or KERNEL_VERSION +# without including , or cases of # including that don't need it. # Copyright (C) 2003, Randy Dunlap @@ -41,8 +41,7 @@ foreach $file (@ARGV) } # Look for uses: LINUX_VERSION_CODE, KERNEL_VERSION, UTS_RELEASE - if (($_ =~ /LINUX_VERSION_CODE/) || ($_ =~ /\WKERNEL_VERSION/) || - ($_ =~ /UTS_RELEASE/)) { + if (($_ =~ /LINUX_VERSION_CODE/) || ($_ =~ /\WKERNEL_VERSION/)) { $fUseVersion = 1; last LINE if $iLinuxVersion; } -- cgit v1.2.3 From dada0769b95602ae791b9c4cd1cbecfc367f00a9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 3 Jul 2006 00:24:05 -0700 Subject: [PATCH] genirq ia64 cleanup Remove duplicate/redundant/wrong IRQF_PERCPU definition. Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/irq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h index 8acb00190d5a..79479e2c6966 100644 --- a/include/asm-ia64/irq.h +++ b/include/asm-ia64/irq.h @@ -14,8 +14,6 @@ #define NR_IRQS 256 #define NR_IRQ_VECTORS NR_IRQS -#define IRQF_PERCPU 0x02000000 - static __inline__ int irq_canonicalize (int irq) { -- cgit v1.2.3 From b02454f43578b24bc8b8ab54a239156841f56f6d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Jul 2006 00:24:06 -0700 Subject: [PATCH] lockdep: special s390 print_symbol() version Have a special version of print_symbol() for s390 which clears the most significant bit of addr before calling __print_symbol(). This seems to be better than checking/changing each place in the kernel that saves an instruction pointer. 
Without this the output would look like: hardirqs last enabled at (30907): [<80018c6a>] 0x80018c6a hardirqs last disabled at (30908): [<8001e48c>] 0x8001e48c softirqs last enabled at (30904): [<8001dc96>] 0x8001dc96 softirqs last disabled at (30897): [<8001dc50>] 0x8001dc50 instead of this: hardirqs last enabled at (19421): [<80018c72>] cpu_idle+0x176/0x1c4 hardirqs last disabled at (19422): [<8001e494>] io_no_vtime+0xa/0x1a softirqs last enabled at (19418): [<8001dc9e>] do_softirq+0xa6/0xe8 softirqs last disabled at (19411): [<8001dc58>] do_softirq+0x60/0xe8 Acked-by: Ingo Molnar Cc: Arjan van de Ven Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 54e2549f96ba..ad71ac053d6e 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -57,10 +57,11 @@ do { \ #define print_fn_descriptor_symbol(fmt, addr) print_symbol(fmt, addr) #endif -#define print_symbol(fmt, addr) \ -do { \ - __check_printsym_format(fmt, ""); \ - __print_symbol(fmt, addr); \ -} while(0) +static inline void print_symbol(const char *fmt, unsigned long addr) +{ + __check_printsym_format(fmt, ""); + __print_symbol(fmt, (unsigned long) + __builtin_extract_return_addr((void *)addr)); +} #endif /*_LINUX_KALLSYMS_H*/ -- cgit v1.2.3 From c32928c579d88acd43981b59e86900da65f40762 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 3 Jul 2006 00:24:10 -0700 Subject: [PATCH] PNPACPI: support shareable interrupts ACPI supplies a "shareable" indication, but PNPACPI ignores it. If a PNP device uses a shared interrupt, request_irq() fails because the PNP driver can't tell whether to supply SA_SHIRQ. 
This patch allows PNP drivers to test (pnp_irq_flags(dev, 0) & IORESOURCE_IRQ_SHAREABLE) Signed-off-by: Bjorn Helgaas Cc: Adam Belay Cc: Matthieu Castet Cc: Li Shaohua Cc: Len Brown Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pnp/pnpacpi/rsparser.c | 11 ++++++++--- include/linux/ioport.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 3a4a644c2686..212268881857 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -74,7 +74,7 @@ static void decode_irq_flags(int flag, int *triggering, int *polarity) static void pnpacpi_parse_allocated_irqresource(struct pnp_resource_table *res, u32 gsi, - int triggering, int polarity) + int triggering, int polarity, int shareable) { int i = 0; int irq; @@ -95,6 +95,9 @@ pnpacpi_parse_allocated_irqresource(struct pnp_resource_table *res, u32 gsi, return; } + if (shareable) + res->irq_resource[i].flags |= IORESOURCE_IRQ_SHAREABLE; + res->irq_resource[i].start = irq; res->irq_resource[i].end = irq; pcibios_penalize_isa_irq(irq, 1); @@ -194,7 +197,8 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, pnpacpi_parse_allocated_irqresource(res_table, res->data.irq.interrupts[i], res->data.irq.triggering, - res->data.irq.polarity); + res->data.irq.polarity, + res->data.irq.sharable); } break; @@ -255,7 +259,8 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, pnpacpi_parse_allocated_irqresource(res_table, res->data.extended_irq.interrupts[i], res->data.extended_irq.triggering, - res->data.extended_irq.polarity); + res->data.extended_irq.polarity, + res->data.extended_irq.sharable); } break; diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 87a9fc039b47..5612dfeeae50 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -55,6 +55,7 @@ struct resource_list { #define IORESOURCE_IRQ_LOWEDGE (1<<1) #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) +#define IORESOURCE_IRQ_SHAREABLE (1<<4) /* ISA PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) -- cgit v1.2.3 From 9614634fe6a138fd8ae044950700d2af8d203f97 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 3 Jul 2006 00:24:13 -0700 Subject: [PATCH] ZVC/zone_reclaim: Leave 1% of unmapped pagecache pages for file I/O It turns out that it is advantageous to leave a small portion of unmapped file backed pages if all of a zone's pages (or almost all pages) are allocated and so the page allocator has to go off-node. This allows recently used file I/O buffers to stay on the node and reduces the number of times that zone reclaim is invoked if file I/O occurs when we run out of memory in a zone. The problem is that zone reclaim runs too frequently when the page cache is used for file I/O (read/write and therefore unmapped pages!) alone and we have almost all pages of the zone allocated. Zone reclaim may remove 32 unmapped pages. File I/O will use these pages for the next read/write requests and the unmapped pages increase. After the zone has filled up again, zone reclaim will remove them again after only 32 pages. This cycle is too inefficient and there are potentially too many zone reclaim cycles. With the 1% boundary we may still remove all unmapped pages for file I/O in a zone reclaim pass. However, it will take a large number of reads and writes to get back to 1% again, at which point zone reclaim is triggered again.
The zone reclaim 2.6.16/17 does not show this behavior because we have a 30 second timeout. [akpm@osdl.org: rename the /proc file and the variable] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 14 ++++++++++++++ include/linux/mmzone.h | 6 ++++++ include/linux/swap.h | 1 + include/linux/sysctl.h | 2 +- kernel/sysctl.c | 11 +++++++++++ mm/page_alloc.c | 22 ++++++++++++++++++++++ mm/vmscan.c | 27 ++++++++++++++------------- 7 files changed, 69 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 86754eb390da..7cee90223d3a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm: - block_dump - drop-caches - zone_reclaim_mode +- min_unmapped_ratio - panic_on_oom ============================================================== @@ -168,6 +169,19 @@ in all nodes of the system. ============================================================= +min_unmapped_ratio: + +This is available only on NUMA kernels. + +A percentage of the file backed pages in each zone. Zone reclaim will only +occur if more than this percentage of pages are file backed and unmapped. +This is to insure that a minimal amount of local pages is still available for +file I/O even if the node is overallocated. + +The default is 1 percent. + +============================================================= + panic_on_oom This enables or disables panic on out-of-memory feature. If this is set to 1, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 27e748eb72b0..656b588a9f96 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -150,6 +150,10 @@ struct zone { unsigned long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA + /* + * zone reclaim becomes active if more unmapped pages exist. + */ + unsigned long min_unmapped_ratio; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; @@ -414,6 +418,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. */ diff --git a/include/linux/swap.h b/include/linux/swap.h index cf6ca6e377bd..5e59184c9096 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -189,6 +189,7 @@ extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int sysctl_min_unmapped_ratio; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 46e4d8f2771f..e4b1a4d4dcf3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -188,7 +188,7 @@ enum VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ - VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ + VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? 
*/ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a58f279077..362a0cc37138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -932,6 +932,17 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_MIN_UNMAPPED, + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e792a583f3b..54a4f5375bba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2005,6 +2005,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; +#ifdef CONFIG_NUMA + zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + / 100; +#endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); @@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA +int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_unmapped_ratio = (zone->present_pages * + sysctl_min_unmapped_ratio) / 100; + return 0; +} +#endif + /* * lowmem_reserve_ratio_sysctl_handler - just a wrapper around * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() diff --git a/mm/vmscan.c b/mm/vmscan.c index ff2ebe9458a3..5d4c4d02254d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1503,10 +1503,6 @@ module_init(kswapd_init) * * If non-zero call zone_reclaim when the number of free pages falls below * the watermarks. - * - * In the future we may add flags to the mode. However, the page allocator - * should only have to check that zone_reclaim_mode != 0 before calling - * zone_reclaim(). */ int zone_reclaim_mode __read_mostly; @@ -1523,6 +1519,12 @@ int zone_reclaim_mode __read_mostly; */ #define ZONE_RECLAIM_PRIORITY 4 +/* + * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * occur. + */ +int sysctl_min_unmapped_ratio = 1; + /* * Try to free up some pages from this zone through reclaim. */ @@ -1590,18 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Do not reclaim if there are not enough reclaimable pages in this - * zone that would satify this allocations. + * Zone reclaim reclaims unmapped file backed pages. * - * All unmapped pagecache pages are reclaimable. - * - * Both counters may be temporarily off a bit so we use - * SWAP_CLUSTER_MAX as the boundary. It may also be good to - * leave a few frequently used unmapped pagecache pages around. + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately + * thrown out if the zone is overallocated. So we do not reclaim + * if less than a specified percentage of the zone is used by + * unmapped file backed pages. 
*/ if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) - return 0; + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) + return 0; /* * Avoid concurrent zone reclaims, do not reclaim in a zone that does -- cgit v1.2.3 From 4d435f9d8ff01ae726a2a84edb9c2457787a337e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:24 -0700 Subject: [PATCH] lockdep: add is_module_address() Add is_module_address() method - to be used by lockdep. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/module.h | 6 ++++++ kernel/module.c | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 9e9dc7c24d95..d06c74fb8c26 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -358,6 +358,7 @@ static inline int module_is_live(struct module *mod) /* Is this address in a module? (second is with no locks, for oops) */ struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); +int is_module_address(unsigned long addr); /* Returns module and fills in value, defined and namebuf, or NULL if symnum out of range. */ @@ -496,6 +497,11 @@ static inline struct module *__module_text_address(unsigned long addr) return NULL; } +static inline int is_module_address(unsigned long addr) +{ + return 0; +} + /* Get/put a kernel symbol (calls should be symmetric) */ #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak)); &(x); }) #define symbol_put(x) do { } while(0) diff --git a/kernel/module.c b/kernel/module.c index 281172f01e9a..0351625767b1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2159,6 +2159,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) return e; } +/* + * Is this a valid module address? + */ +int is_module_address(unsigned long addr) +{ + unsigned long flags; + struct module *mod; + + spin_lock_irqsave(&modlist_lock, flags); + + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_core, mod->core_size)) { + spin_unlock_irqrestore(&modlist_lock, flags); + return 1; + } + } + + spin_unlock_irqrestore(&modlist_lock, flags); + + return 0; +} + + /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ struct module *__module_text_address(unsigned long addr) { -- cgit v1.2.3 From 8d8fdf5c76816e5263073008f03f097ffc713db3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Jul 2006 00:24:25 -0700 Subject: [PATCH] lockdep: add print_ip_sym() Provide a common print_ip_sym() function that prints the passed instruction pointer as well as the symbol belonging to it. Avoids adding a bunch of #ifdef CONFIG_64BIT in order to get the printk format right on 32/64 bit platforms. 
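As an illustration (the call site below is hypothetical, not part of this patch), code that previously had to open-code the width-dependent format:

#ifdef CONFIG_64BIT
	printk("[<%016lx>]", ip);
#else
	printk("[<%08lx>]", ip);
#endif
	print_symbol(" %s\n", ip);

now collapses to a single call that is width-correct on both 32- and 64-bit builds:

	print_ip_sym(ip);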
Acked-by: Ingo Molnar Cc: Arjan van de Ven Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ad71ac053d6e..849043ce4ed6 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -64,4 +64,18 @@ static inline void print_symbol(const char *fmt, unsigned long addr) __builtin_extract_return_addr((void *)addr)); } +#ifndef CONFIG_64BIT +#define print_ip_sym(ip) \ +do { \ + printk("[<%08lx>]", ip); \ + print_symbol(" %s\n", ip); \ +} while(0) +#else +#define print_ip_sym(ip) \ +do { \ + printk("[<%016lx>]", ip); \ + print_symbol(" %s\n", ip); \ +} while(0) +#endif + #endif /*_LINUX_KALLSYMS_H*/ -- cgit v1.2.3 From a875a69f8b00a38b4f40d9632a4fc71a159f0e0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:26 -0700 Subject: [PATCH] lockdep: add per_cpu_offset() Add the per_cpu_offset() generic method. (used by the lock validator) Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 2 ++ include/asm-ia64/percpu.h | 1 + include/asm-powerpc/percpu.h | 1 + include/asm-s390/percpu.h | 1 + include/asm-sparc64/percpu.h | 1 + include/asm-x86_64/percpu.h | 2 ++ 6 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index c74521157461..e160e04290fb 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -7,6 +7,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; +#define per_cpu_offset(x) (__per_cpu_offset[x]) + /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index 24d898b650c5..fbe5cf3ab8dc 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -36,6 +36,7 @@ #ifdef CONFIG_SMP extern unsigned long __per_cpu_offset[NR_CPUS]; +#define per_cpu_offset(x) (__per_cpu_offset(x)) /* Equal to __per_cpu_offset[smp_processor_id()], but faster to access: */ DECLARE_PER_CPU(unsigned long, local_per_cpu_offset); diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h index faa1fc703053..2f2e3024fa61 100644 --- a/include/asm-powerpc/percpu.h +++ b/include/asm-powerpc/percpu.h @@ -14,6 +14,7 @@ #define __per_cpu_offset(cpu) (paca[cpu].data_offset) #define __my_cpu_offset() get_paca()->data_offset +#define per_cpu_offset(x) (__per_cpu_offset(x)) /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h index d9a8cca9b653..28b3517e787c 100644 --- a/include/asm-s390/percpu.h +++ b/include/asm-s390/percpu.h @@ -42,6 +42,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) +#define per_cpu_offset(x) (__per_cpu_offset[x]) /* A macro to avoid #include hell... 
*/ #define percpu_modcopy(pcpudst, src, size) \ diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h index a6ece06b83db..ced8cbde046d 100644 --- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -11,6 +11,7 @@ extern unsigned long __per_cpu_base; extern unsigned long __per_cpu_shift; #define __per_cpu_offset(__cpu) \ (__per_cpu_base + ((unsigned long)(__cpu) << __per_cpu_shift)) +#define per_cpu_offset(x) (__per_cpu_offset(x)) /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 549eb929b2c0..08dd9f9dda81 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -14,6 +14,8 @@ #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) #define __my_cpu_offset() read_pda(data_offset) +#define per_cpu_offset(x) (__per_cpu_offset(x)) + /* Separate out the type, so (int[3], foo) works. */ #define DEFINE_PER_CPU(type, name) \ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name -- cgit v1.2.3 From c01d403b2e3e3f231b18ebd07ad64ecbe6a258a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:27 -0700 Subject: [PATCH] lockdep: add disable/enable_irq_lockdep() API lockdep wants to use the disable_irq()/enable_irq() prototypes before they are provied by the platform's asm/irq.h. So move them out of the CONFIG_GENERIC_HARDIRQS define - all architectures have a common prototype for this anyway. Add special lockdep variants of irq line disabling/enabling. These should be used for locking constructs that know that a particular irq context which is disabled, and which is the only irq-context user of a lock, that it's safe to take the lock in the irq-disabled section without disabling hardirqs. [akpm@osdl.org: build fix] Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interrupt.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index da3e0dbe61d4..2c5452c1d7bb 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -86,6 +86,41 @@ extern void disable_irq_nosync(unsigned int irq); extern void disable_irq(unsigned int irq); extern void enable_irq(unsigned int irq); +/* + * Special lockdep variants of irq disabling/enabling. + * These should be used for locking constructs that + * know that a particular irq context which is disabled, + * and which is the only irq-context user of a lock, + * that it's safe to take the lock in the irq-disabled + * section without disabling hardirqs. + * + * On !CONFIG_LOCKDEP they are equivalent to the normal + * irq disable/enable methods. 
+ */ +static inline void disable_irq_nosync_lockdep(unsigned int irq) +{ + disable_irq_nosync(irq); +#ifdef CONFIG_LOCKDEP + local_irq_disable(); +#endif +} + +static inline void disable_irq_lockdep(unsigned int irq) +{ + disable_irq(irq); +#ifdef CONFIG_LOCKDEP + local_irq_disable(); +#endif +} + +static inline void enable_irq_lockdep(unsigned int irq) +{ +#ifdef CONFIG_LOCKDEP + local_irq_enable(); +#endif + enable_irq(irq); +} + /* IRQ wakeup (PM) control: */ extern int set_irq_wake(unsigned int irq, unsigned int on); @@ -99,7 +134,19 @@ static inline int disable_irq_wake(unsigned int irq) return set_irq_wake(irq, 0); } -#endif +#else /* !CONFIG_GENERIC_HARDIRQS */ +/* + * NOTE: non-genirq architectures, if they want to support the lock + * validator need to define the methods below in their asm/irq.h + * files, under an #ifdef CONFIG_LOCKDEP section. + */ +# ifndef CONFIG_LOCKDEP +# define disable_irq_nosync_lockdep(irq) disable_irq_nosync(irq) +# define disable_irq_lockdep(irq) disable_irq(irq) +# define enable_irq_lockdep(irq) enable_irq(irq) +# endif + +#endif /* CONFIG_GENERIC_HARDIRQS */ #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) -- cgit v1.2.3 From d7e9629de051bb4b1d104588cd97673ad770809e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:27 -0700 Subject: [PATCH] lockdep: add local_irq_enable_in_hardirq() API Introduce local_irq_enable_in_hardirq() API. It is currently aliased to local_irq_enable(), hence has no functional effects. This API will be used by lockdep, but even without lockdep this will better document places in the kernel where a hardirq context enables hardirqs. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interrupt.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2c5452c1d7bb..73463fbb38e4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -80,6 +80,23 @@ extern int request_irq(unsigned int, unsigned long, const char *, void *); extern void free_irq(unsigned int, void *); +/* + * On lockdep we dont want to enable hardirqs in hardirq + * context. Use local_irq_enable_in_hardirq() to annotate + * kernel code that has to do this nevertheless (pretty much + * the only valid case is for old/broken hardware that is + * insanely slow). + * + * NOTE: in theory this might break fragile code that relies + * on hardirq delivery - in practice we dont seem to have such + * places left. So the only effect should be slightly increased + * irqs-off latencies. + */ +#ifdef CONFIG_LOCKDEP +# define local_irq_enable_in_hardirq() do { } while (0) +#else +# define local_irq_enable_in_hardirq() local_irq_enable() +#endif #ifdef CONFIG_GENERIC_HARDIRQS extern void disable_irq_nosync(unsigned int irq); -- cgit v1.2.3 From 8b3db9c542e18b71d4820da4dd9401ee030feacb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:28 -0700 Subject: [PATCH] lockdep: add DECLARE_COMPLETION_ONSTACK() API lockdep needs to have the waitqueue lock initialized for on-stack waitqueues implicitly initialized by DECLARE_COMPLETION(). Introduce the API. 
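A minimal usage sketch (the function and its producer below are invented for illustration; DECLARE_COMPLETION_ONSTACK(), wait_for_completion() and complete() are the real APIs):

	static void do_one_request(void)
	{
		/* on-stack: runs init_completion(), so lockdep sees a real init */
		DECLARE_COMPLETION_ONSTACK(done);

		queue_my_work(&done);		/* hypothetical helper whose handler calls complete(&done) */
		wait_for_completion(&done);
	}

File-scope completions can keep using DECLARE_COMPLETION(), which remains a constant initializer.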
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/completion.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 90663ad217f9..251c41e3ddd5 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -21,6 +21,18 @@ struct completion { #define DECLARE_COMPLETION(work) \ struct completion work = COMPLETION_INITIALIZER(work) +/* + * Lockdep needs to run a non-constant initializer for on-stack + * completions - so we use the _ONSTACK() variant for those that + * are on the kernel stack: + */ +#ifdef CONFIG_LOCKDEP +# define DECLARE_COMPLETION_ONSTACK(work) \ + struct completion work = ({ init_completion(&work); work; }) +#else +# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) +#endif + static inline void init_completion(struct completion *x) { x->done = 0; -- cgit v1.2.3 From c4e05116a2c4d8187127dbf77ab790aa57a47388 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:29 -0700 Subject: [PATCH] lockdep: clean up rwsems Clean up rwsems. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/rwsem.h | 17 +------ include/linux/rwsem-spinlock.h | 14 +----- include/linux/rwsem.h | 24 ---------- kernel/rwsem.c | 105 +++++++++++++++++++++++++++++++++++++++++ lib/rwsem-spinlock.c | 46 +----------------- lib/rwsem.c | 31 ------------ 6 files changed, 109 insertions(+), 128 deletions(-) create mode 100644 kernel/rwsem.c (limited to 'include') diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h index be4ab859238e..558804e4a039 100644 --- a/include/asm-i386/rwsem.h +++ b/include/asm-i386/rwsem.h @@ -61,23 +61,11 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; -/* - * initialisation - */ -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEBUG_INIT } + } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -87,9 +75,6 @@ static inline void init_rwsem(struct rw_semaphore *sem) sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } /* diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index f30f805080ae..d68afcc36ac9 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -32,22 +32,10 @@ struct rw_semaphore { __s32 activity; spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; -/* - * initialisation - */ -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ -{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT } +{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index f99fe90732ab..93581534b915 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -9,8 +9,6 @@ #include -#define RWSEM_DEBUG 0 - #ifdef __KERNEL__ #include @@ -26,23 +24,13 @@ struct 
rw_semaphore; #include /* use an arch-specific implementation */ #endif -#ifndef rwsemtrace -#if RWSEM_DEBUG -extern void FASTCALL(rwsemtrace(struct rw_semaphore *sem, const char *str)); -#else -#define rwsemtrace(SEM,FMT) -#endif -#endif - /* * lock for reading */ static inline void down_read(struct rw_semaphore *sem) { might_sleep(); - rwsemtrace(sem,"Entering down_read"); __down_read(sem); - rwsemtrace(sem,"Leaving down_read"); } /* @@ -51,9 +39,7 @@ static inline void down_read(struct rw_semaphore *sem) static inline int down_read_trylock(struct rw_semaphore *sem) { int ret; - rwsemtrace(sem,"Entering down_read_trylock"); ret = __down_read_trylock(sem); - rwsemtrace(sem,"Leaving down_read_trylock"); return ret; } @@ -63,9 +49,7 @@ static inline int down_read_trylock(struct rw_semaphore *sem) static inline void down_write(struct rw_semaphore *sem) { might_sleep(); - rwsemtrace(sem,"Entering down_write"); __down_write(sem); - rwsemtrace(sem,"Leaving down_write"); } /* @@ -74,9 +58,7 @@ static inline void down_write(struct rw_semaphore *sem) static inline int down_write_trylock(struct rw_semaphore *sem) { int ret; - rwsemtrace(sem,"Entering down_write_trylock"); ret = __down_write_trylock(sem); - rwsemtrace(sem,"Leaving down_write_trylock"); return ret; } @@ -85,9 +67,7 @@ static inline int down_write_trylock(struct rw_semaphore *sem) */ static inline void up_read(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering up_read"); __up_read(sem); - rwsemtrace(sem,"Leaving up_read"); } /* @@ -95,9 +75,7 @@ static inline void up_read(struct rw_semaphore *sem) */ static inline void up_write(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering up_write"); __up_write(sem); - rwsemtrace(sem,"Leaving up_write"); } /* @@ -105,9 +83,7 @@ static inline void up_write(struct rw_semaphore *sem) */ static inline void downgrade_write(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering downgrade_write"); __downgrade_write(sem); - rwsemtrace(sem,"Leaving downgrade_write"); } #endif /* __KERNEL__ */ diff --git a/kernel/rwsem.c b/kernel/rwsem.c new file mode 100644 index 000000000000..790a99bb25aa --- /dev/null +++ b/kernel/rwsem.c @@ -0,0 +1,105 @@ +/* kernel/rwsem.c: R/W semaphores, public implementation + * + * Written by David Howells (dhowells@redhat.com). 
+ * Derived from asm-i386/semaphore.h + */ + +#include +#include +#include +#include + +#include +#include + +/* + * lock for reading + */ +void down_read(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int down_read_trylock(struct rw_semaphore *sem) +{ + int ret = __down_read_trylock(sem); + + if (ret == 1) + rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_read_trylock); + +/* + * lock for writing + */ +void down_write(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + __down_write(sem); +} + +EXPORT_SYMBOL(down_write); + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int down_write_trylock(struct rw_semaphore *sem) +{ + int ret = __down_write_trylock(sem); + + if (ret == 1) + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_write_trylock); + +/* + * release a read lock + */ +void up_read(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_read(sem); +} + +EXPORT_SYMBOL(up_read); + +/* + * release a write lock + */ +void up_write(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_write(sem); +} + +EXPORT_SYMBOL(up_write); + +/* + * downgrade write lock to read lock + */ +void downgrade_write(struct rw_semaphore *sem) +{ + /* + * lockdep: a downgraded write will live on as a write + * dependency. + */ + __downgrade_write(sem); +} + +EXPORT_SYMBOL(downgrade_write); diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 40ffde940a86..03b6097eb04e 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -17,16 +17,6 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; -#if RWSEM_DEBUG -void rwsemtrace(struct rw_semaphore *sem, const char *str) -{ - if (sem->debug) - printk("[%d] %s({%d,%d})\n", - current->pid, str, sem->activity, - list_empty(&sem->wait_list) ? 
0 : 1); -} -#endif - /* * initialise the semaphore */ @@ -35,9 +25,6 @@ void fastcall init_rwsem(struct rw_semaphore *sem) sem->activity = 0; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } /* @@ -56,8 +43,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) struct task_struct *tsk; int woken; - rwsemtrace(sem, "Entering __rwsem_do_wake"); - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!wakewrite) { @@ -104,7 +89,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) sem->activity += woken; out: - rwsemtrace(sem, "Leaving __rwsem_do_wake"); return sem; } @@ -138,8 +122,6 @@ void fastcall __sched __down_read(struct rw_semaphore *sem) struct rwsem_waiter waiter; struct task_struct *tsk; - rwsemtrace(sem, "Entering __down_read"); - spin_lock_irq(&sem->wait_lock); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { @@ -171,9 +153,8 @@ void fastcall __sched __down_read(struct rw_semaphore *sem) } tsk->state = TASK_RUNNING; - out: - rwsemtrace(sem, "Leaving __down_read"); + ; } /* @@ -184,7 +165,6 @@ int fastcall __down_read_trylock(struct rw_semaphore *sem) unsigned long flags; int ret = 0; - rwsemtrace(sem, "Entering __down_read_trylock"); spin_lock_irqsave(&sem->wait_lock, flags); @@ -196,7 +176,6 @@ int fastcall __down_read_trylock(struct rw_semaphore *sem) spin_unlock_irqrestore(&sem->wait_lock, flags); - rwsemtrace(sem, "Leaving __down_read_trylock"); return ret; } @@ -209,8 +188,6 @@ void fastcall __sched __down_write(struct rw_semaphore *sem) struct rwsem_waiter waiter; struct task_struct *tsk; - rwsemtrace(sem, "Entering __down_write"); - spin_lock_irq(&sem->wait_lock); if (sem->activity == 0 && list_empty(&sem->wait_list)) { @@ -242,9 +219,8 @@ void fastcall __sched __down_write(struct rw_semaphore *sem) } tsk->state = TASK_RUNNING; - out: - rwsemtrace(sem, "Leaving __down_write"); + ; } /* @@ -255,8 +231,6 @@ int fastcall __down_write_trylock(struct rw_semaphore *sem) unsigned long flags; int ret = 0; - rwsemtrace(sem, "Entering __down_write_trylock"); - spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { @@ -267,7 +241,6 @@ int fastcall __down_write_trylock(struct rw_semaphore *sem) spin_unlock_irqrestore(&sem->wait_lock, flags); - rwsemtrace(sem, "Leaving __down_write_trylock"); return ret; } @@ -278,16 +251,12 @@ void fastcall __up_read(struct rw_semaphore *sem) { unsigned long flags; - rwsemtrace(sem, "Entering __up_read"); - spin_lock_irqsave(&sem->wait_lock, flags); if (--sem->activity == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __up_read"); } /* @@ -297,8 +266,6 @@ void fastcall __up_write(struct rw_semaphore *sem) { unsigned long flags; - rwsemtrace(sem, "Entering __up_write"); - spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 0; @@ -306,8 +273,6 @@ void fastcall __up_write(struct rw_semaphore *sem) sem = __rwsem_do_wake(sem, 1); spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __up_write"); } /* @@ -318,8 +283,6 @@ void fastcall __downgrade_write(struct rw_semaphore *sem) { unsigned long flags; - rwsemtrace(sem, "Entering __downgrade_write"); - spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 1; @@ -327,8 +290,6 @@ void fastcall __downgrade_write(struct rw_semaphore *sem) sem = __rwsem_do_wake(sem, 0); spin_unlock_irqrestore(&sem->wait_lock, flags); - - 
rwsemtrace(sem, "Leaving __downgrade_write"); } EXPORT_SYMBOL(init_rwsem); @@ -339,6 +300,3 @@ EXPORT_SYMBOL(__down_write_trylock); EXPORT_SYMBOL(__up_read); EXPORT_SYMBOL(__up_write); EXPORT_SYMBOL(__downgrade_write); -#if RWSEM_DEBUG -EXPORT_SYMBOL(rwsemtrace); -#endif diff --git a/lib/rwsem.c b/lib/rwsem.c index 62fa4eba9ffe..bae597284889 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -16,17 +16,6 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; -#if RWSEM_DEBUG -#undef rwsemtrace -void rwsemtrace(struct rw_semaphore *sem, const char *str) -{ - printk("sem=%p\n", sem); - printk("(sem)=%08lx\n", sem->count); - if (sem->debug) - printk("[%d] %s({%08lx})\n", current->pid, str, sem->count); -} -#endif - /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then: @@ -45,8 +34,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) struct list_head *next; signed long oldcount, woken, loop; - rwsemtrace(sem, "Entering __rwsem_do_wake"); - if (downgrading) goto dont_wake_writers; @@ -127,7 +114,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) next->prev = &sem->wait_list; out: - rwsemtrace(sem, "Leaving __rwsem_do_wake"); return sem; /* undo the change to count, but check for a transition 1->0 */ @@ -186,13 +172,9 @@ rwsem_down_read_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - rwsemtrace(sem, "Entering rwsem_down_read_failed"); - waiter.flags = RWSEM_WAITING_FOR_READ; rwsem_down_failed_common(sem, &waiter, RWSEM_WAITING_BIAS - RWSEM_ACTIVE_BIAS); - - rwsemtrace(sem, "Leaving rwsem_down_read_failed"); return sem; } @@ -204,12 +186,9 @@ rwsem_down_write_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - rwsemtrace(sem, "Entering rwsem_down_write_failed"); - waiter.flags = RWSEM_WAITING_FOR_WRITE; rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_BIAS); - rwsemtrace(sem, "Leaving rwsem_down_write_failed"); return sem; } @@ -221,8 +200,6 @@ struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; - rwsemtrace(sem, "Entering rwsem_wake"); - spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ @@ -231,8 +208,6 @@ struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem) spin_unlock_irqrestore(&sem->wait_lock, flags); - rwsemtrace(sem, "Leaving rwsem_wake"); - return sem; } @@ -245,8 +220,6 @@ struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; - rwsemtrace(sem, "Entering rwsem_downgrade_wake"); - spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ @@ -255,7 +228,6 @@ struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem) spin_unlock_irqrestore(&sem->wait_lock, flags); - rwsemtrace(sem, "Leaving rwsem_downgrade_wake"); return sem; } @@ -263,6 +235,3 @@ EXPORT_SYMBOL(rwsem_down_read_failed); EXPORT_SYMBOL(rwsem_down_write_failed); EXPORT_SYMBOL(rwsem_wake); EXPORT_SYMBOL(rwsem_downgrade_wake); -#if RWSEM_DEBUG -EXPORT_SYMBOL(rwsemtrace); -#endif -- cgit v1.2.3 From 61f4c3d6db3ecbdd4e1a2a7a1710c1410d085dd1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:29 -0700 Subject: [PATCH] lockdep: remove RWSEM_DEBUG remnants RWSEM_DEBUG used to be a printk based 'tracing' facility, probably used for very early prototypes of the rwsem code. Remove it. 
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/rwsem.h | 14 +------------- include/asm-ia64/rwsem.h | 18 +----------------- include/asm-powerpc/rwsem.h | 18 +----------------- include/asm-sh/rwsem.h | 18 +----------------- include/asm-xtensa/rwsem.h | 18 +----------------- 5 files changed, 5 insertions(+), 81 deletions(-) (limited to 'include') diff --git a/include/asm-alpha/rwsem.h b/include/asm-alpha/rwsem.h index fafdd4f7010a..1570c0b54336 100644 --- a/include/asm-alpha/rwsem.h +++ b/include/asm-alpha/rwsem.h @@ -36,20 +36,11 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT } + LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -59,9 +50,6 @@ static inline void init_rwsem(struct rw_semaphore *sem) sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } static inline void __down_read(struct rw_semaphore *sem) diff --git a/include/asm-ia64/rwsem.h b/include/asm-ia64/rwsem.h index 1327c91ea39c..2d1640cc240a 100644 --- a/include/asm-ia64/rwsem.h +++ b/include/asm-ia64/rwsem.h @@ -33,9 +33,6 @@ struct rw_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; #define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) @@ -45,19 +42,9 @@ struct rw_semaphore { #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -/* - * initialization - */ -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEBUG_INIT } + LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -73,9 +60,6 @@ init_rwsem (struct rw_semaphore *sem) sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } /* diff --git a/include/asm-powerpc/rwsem.h b/include/asm-powerpc/rwsem.h index 2c2fe9647595..e929145e1e46 100644 --- a/include/asm-powerpc/rwsem.h +++ b/include/asm-powerpc/rwsem.h @@ -28,24 +28,11 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; -/* - * initialisation - */ -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEBUG_INIT } + LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -60,9 +47,6 @@ static inline void init_rwsem(struct rw_semaphore *sem) sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } 
/* diff --git a/include/asm-sh/rwsem.h b/include/asm-sh/rwsem.h index 0262d3d1e5e0..9d2aea5e8488 100644 --- a/include/asm-sh/rwsem.h +++ b/include/asm-sh/rwsem.h @@ -25,24 +25,11 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; -/* - * initialisation - */ -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEBUG_INIT } + LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -57,9 +44,6 @@ static inline void init_rwsem(struct rw_semaphore *sem) sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } /* diff --git a/include/asm-xtensa/rwsem.h b/include/asm-xtensa/rwsem.h index abcd86dc5ab9..0aad3a587551 100644 --- a/include/asm-xtensa/rwsem.h +++ b/include/asm-xtensa/rwsem.h @@ -31,24 +31,11 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; -#if RWSEM_DEBUG - int debug; -#endif }; -/* - * initialisation - */ -#if RWSEM_DEBUG -#define __RWSEM_DEBUG_INIT , 0 -#else -#define __RWSEM_DEBUG_INIT /* */ -#endif - #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEBUG_INIT } + LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -63,9 +50,6 @@ static inline void init_rwsem(struct rw_semaphore *sem) sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#if RWSEM_DEBUG - sem->debug = 0; -#endif } /* -- cgit v1.2.3 From 9a11b49a805665e13a56aa067afaf81d43ec1514 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:33 -0700 Subject: [PATCH] lockdep: better lock debugging Generic lock debugging: - generalized lock debugging framework. For example, a bug in one lock subsystem turns off debugging in all lock subsystems. - got rid of the caller address passing (__IP__/__IP_DECL__/etc.) from the mutex/rtmutex debugging code: it caused way too much prototype hackery, and lockdep will give the same information anyway. - ability to do silent tests - check lock freeing in vfree too. - more finegrained debugging options, to allow distributions to turn off more expensive debugging features. There's no separate 'held mutexes' list anymore - but there's a 'held locks' stack within lockdep, which unifies deadlock detection across all lock classes. 
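As a rough model of the "one global switch" behaviour described above, here is a minimal userspace sketch. It mirrors the debug_locks_off() helper this patch adds in lib/debug_locks.c, but only as an illustration: the kernel version uses xchg() on the flag and calls console_verbose(), and the names below are copied from the patch purely for readability:

#include <stdio.h>

static int debug_locks = 1;	/* one flag for every lock subsystem */
static int debug_locks_silent;	/* the selftests set this for silent runs */

static int debug_locks_off(void)
{
	/* one-shot: only the first detected bug wins the right to report */
	if (debug_locks) {
		debug_locks = 0;
		if (!debug_locks_silent)
			return 1;	/* caller may print its warning */
	}
	return 0;
}

int main(void)
{
	/* two bugs, as if from two different lock subsystems: */
	if (debug_locks_off())
		printf("BUG: reported, lock debugging now off everywhere\n");
	if (debug_locks_off())
		printf("never printed: later bugs stay silent\n");
	return 0;
}

That same one-shot switch guards the unified 'held locks' stack mentioned above.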
(this is independent of the lockdep validation stuff - lockdep first checks whether we are holding a lock already) Here are the current debugging options: CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_LOCK_ALLOC=y which do: config DEBUG_MUTEXES bool "Mutex debugging, basic checks" config DEBUG_LOCK_ALLOC bool "Detect incorrect freeing of live mutexes" Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/sysrq.c | 2 +- include/asm-generic/mutex-null.h | 15 +- include/linux/debug_locks.h | 69 +++++++++ include/linux/init_task.h | 1 - include/linux/mm.h | 8 +- include/linux/mutex-debug.h | 12 +- include/linux/mutex.h | 6 - include/linux/rtmutex.h | 10 -- include/linux/sched.h | 4 - init/main.c | 8 ++ kernel/exit.c | 5 +- kernel/fork.c | 4 - kernel/mutex-debug.c | 51 +------ kernel/mutex-debug.h | 82 ++--------- kernel/mutex.c | 52 +++---- kernel/mutex.h | 17 +-- kernel/rtmutex-debug.c | 302 ++------------------------------------- kernel/rtmutex-debug.h | 8 +- kernel/rtmutex.c | 46 +++--- kernel/rtmutex.h | 3 - kernel/sched.c | 16 ++- lib/Makefile | 2 +- lib/debug_locks.c | 45 ++++++ lib/spinlock_debug.c | 62 ++++---- mm/vmalloc.c | 2 + 25 files changed, 265 insertions(+), 567 deletions(-) create mode 100644 include/linux/debug_locks.h create mode 100644 lib/debug_locks.c (limited to 'include') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index a064ee9181c0..e31f079400fa 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -151,7 +151,7 @@ static struct sysrq_key_op sysrq_mountro_op = { static void sysrq_handle_showlocks(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { - mutex_debug_show_all_locks(); + debug_show_all_locks(); } static struct sysrq_key_op sysrq_showlocks_op = { .handler = sysrq_handle_showlocks, diff --git a/include/asm-generic/mutex-null.h b/include/asm-generic/mutex-null.h index 5cf8b7ce0c45..254a126ede5c 100644 --- a/include/asm-generic/mutex-null.h +++ b/include/asm-generic/mutex-null.h @@ -10,15 +10,10 @@ #ifndef _ASM_GENERIC_MUTEX_NULL_H #define _ASM_GENERIC_MUTEX_NULL_H -/* extra parameter only needed for mutex debugging: */ -#ifndef __IP__ -# define __IP__ -#endif - -#define __mutex_fastpath_lock(count, fail_fn) fail_fn(count __RET_IP__) -#define __mutex_fastpath_lock_retval(count, fail_fn) fail_fn(count __RET_IP__) -#define __mutex_fastpath_unlock(count, fail_fn) fail_fn(count __RET_IP__) -#define __mutex_fastpath_trylock(count, fail_fn) fail_fn(count) -#define __mutex_slowpath_needs_to_unlock() 1 +#define __mutex_fastpath_lock(count, fail_fn) fail_fn(count) +#define __mutex_fastpath_lock_retval(count, fail_fn) fail_fn(count) +#define __mutex_fastpath_unlock(count, fail_fn) fail_fn(count) +#define __mutex_fastpath_trylock(count, fail_fn) fail_fn(count) +#define __mutex_slowpath_needs_to_unlock() 1 #endif diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h new file mode 100644 index 000000000000..6a7047851e48 --- /dev/null +++ b/include/linux/debug_locks.h @@ -0,0 +1,69 @@ +#ifndef __LINUX_DEBUG_LOCKING_H +#define __LINUX_DEBUG_LOCKING_H + +extern int debug_locks; +extern int debug_locks_silent; + +/* + * Generic 'turn off all lock debugging' function: + */ +extern int debug_locks_off(void); + +/* + * In the debug case we carry the caller's instruction pointer into + * other functions, but we dont want the function argument overhead + * in the nondebug case - hence these macros: + */ +#define _RET_IP_ (unsigned long)__builtin_return_address(0) 
+#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) + +#define DEBUG_LOCKS_WARN_ON(c) \ +({ \ + int __ret = 0; \ + \ + if (unlikely(c)) { \ + if (debug_locks_off()) \ + WARN_ON(1); \ + __ret = 1; \ + } \ + __ret; \ +}) + +#ifdef CONFIG_SMP +# define SMP_DEBUG_LOCKS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +#else +# define SMP_DEBUG_LOCKS_WARN_ON(c) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS + extern void locking_selftest(void); +#else +# define locking_selftest() do { } while (0) +#endif + +#ifdef CONFIG_LOCKDEP +extern void debug_show_all_locks(void); +extern void debug_show_held_locks(struct task_struct *task); +extern void debug_check_no_locks_freed(const void *from, unsigned long len); +extern void debug_check_no_locks_held(struct task_struct *task); +#else +static inline void debug_show_all_locks(void) +{ +} + +static inline void debug_show_held_locks(struct task_struct *task) +{ +} + +static inline void +debug_check_no_locks_freed(const void *from, unsigned long len) +{ +} + +static inline void +debug_check_no_locks_held(struct task_struct *task) +{ +} +#endif + +#endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3a256957fb56..678c1a90380d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,7 +124,6 @@ extern struct group_info init_groups; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ - INIT_RT_MUTEXES(tsk) \ } diff --git a/include/linux/mm.h b/include/linux/mm.h index 75179529e399..990957e0929f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1034,13 +1035,6 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ -static inline void -debug_check_no_locks_freed(const void *from, unsigned long len) -{ - mutex_debug_check_no_locks_freed(from, len); - rt_mutex_debug_check_no_locks_freed(from, len); -} - #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h index 8b5769f00467..70a26091fc73 100644 --- a/include/linux/mutex-debug.h +++ b/include/linux/mutex-debug.h @@ -7,17 +7,11 @@ * Mutexes - debugging helpers: */ -#define __DEBUG_MUTEX_INITIALIZER(lockname) \ - , .held_list = LIST_HEAD_INIT(lockname.held_list), \ - .name = #lockname , .magic = &lockname +#define __DEBUG_MUTEX_INITIALIZER(lockname) \ + , .magic = &lockname -#define mutex_init(sem) __mutex_init(sem, __FUNCTION__) +#define mutex_init(sem) __mutex_init(sem, __FILE__":"#sem) extern void FASTCALL(mutex_destroy(struct mutex *lock)); -extern void mutex_debug_show_all_locks(void); -extern void mutex_debug_show_held_locks(struct task_struct *filter); -extern void mutex_debug_check_no_locks_held(struct task_struct *task); -extern void mutex_debug_check_no_locks_freed(const void *from, unsigned long len); - #endif diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f1ac507fa20d..caafecd5e366 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -50,8 +50,6 @@ struct mutex { struct list_head wait_list; #ifdef CONFIG_DEBUG_MUTEXES struct thread_info *owner; - struct list_head held_list; - unsigned long acquire_ip; const char *name; void *magic; #endif @@ -76,10 +74,6 @@ struct mutex_waiter { # define __DEBUG_MUTEX_INITIALIZER(lockname) # define mutex_init(mutex) __mutex_init(mutex, NULL) 
# define mutex_destroy(mutex) do { } while (0) -# define mutex_debug_show_all_locks() do { } while (0) -# define mutex_debug_show_held_locks(p) do { } while (0) -# define mutex_debug_check_no_locks_held(task) do { } while (0) -# define mutex_debug_check_no_locks_freed(from, len) do { } while (0) #endif #define __MUTEX_INITIALIZER(lockname) \ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa4a3b82ba70..5d41dee82f80 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -29,8 +29,6 @@ struct rt_mutex { struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - struct list_head held_list_entry; - unsigned long acquire_ip; const char *name, *file; int line; void *magic; @@ -98,14 +96,6 @@ extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); -#ifdef CONFIG_DEBUG_RT_MUTEXES -# define INIT_RT_MUTEX_DEBUG(tsk) \ - .held_list_head = LIST_HEAD_INIT(tsk.held_list_head), \ - .held_list_lock = SPIN_LOCK_UNLOCKED -#else -# define INIT_RT_MUTEX_DEBUG(tsk) -#endif - #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index aaf723308ed4..bdabeee10a78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -865,10 +865,6 @@ struct task_struct { struct plist_head pi_waiters; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; -# ifdef CONFIG_DEBUG_RT_MUTEXES - spinlock_t held_list_lock; - struct list_head held_list_head; -# endif #endif #ifdef CONFIG_DEBUG_MUTEXES diff --git a/init/main.c b/init/main.c index d604dfef82b8..fc73e1cd8614 100644 --- a/init/main.c +++ b/init/main.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -511,6 +512,13 @@ asmlinkage void __init start_kernel(void) console_init(); if (panic_later) panic(panic_later, panic_param); + /* + * Need to run this when irqs are enabled, because it wants + * to self-test [hard/soft]-irqs on/off lock inversion bugs + * too: + */ + locking_selftest(); + #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && initrd_start < min_low_pfn << PAGE_SHIFT) { diff --git a/kernel/exit.c b/kernel/exit.c index 7f7ef2258553..c595db14cf25 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -933,10 +933,9 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); /* - * If DEBUG_MUTEXES is on, make sure we are holding no locks: + * Make sure we are holding no locks: */ - mutex_debug_check_no_locks_held(tsk); - rt_mutex_debug_check_no_locks_held(tsk); + debug_check_no_locks_held(tsk); if (tsk->io_context) exit_io_context(); diff --git a/kernel/fork.c b/kernel/fork.c index 9064bf9e131b..1cd46a4fb0d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -919,10 +919,6 @@ static inline void rt_mutex_init_task(struct task_struct *p) spin_lock_init(&p->pi_lock); plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; -# ifdef CONFIG_DEBUG_RT_MUTEXES - spin_lock_init(&p->held_list_lock); - INIT_LIST_HEAD(&p->held_list_head); -# endif #endif } diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index a92de145ed0d..5569766a1ea2 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -20,52 +20,19 @@ #include #include #include +#include #include "mutex-debug.h" -/* - * We need a global lock when we walk through the multi-process - * lock tree. 
Only used in the deadlock-debugging case. - */ -DEFINE_SPINLOCK(debug_mutex_lock); - -/* - * All locks held by all tasks, in a single global list: - */ -LIST_HEAD(debug_mutex_held_locks); - -/* - * In the debug case we carry the caller's instruction pointer into - * other functions, but we dont want the function argument overhead - * in the nondebug case - hence these macros: - */ -#define __IP_DECL__ , unsigned long ip -#define __IP__ , ip -#define __RET_IP__ , (unsigned long)__builtin_return_address(0) - -/* - * "mutex debugging enabled" flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -int debug_mutex_on = 1; - /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, - struct thread_info *new_owner __IP_DECL__) +void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) { lock->owner = new_owner; - DEBUG_LOCKS_WARN_ON(!list_empty(&lock->held_list)); - if (debug_mutex_on) { - list_add_tail(&lock->held_list, &debug_mutex_held_locks); - lock->acquire_ip = ip; - } } -void debug_mutex_init_waiter(struct mutex_waiter *waiter) +void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; @@ -87,9 +54,10 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti __IP_DECL__) + struct thread_info *ti) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; waiter->lock = lock; @@ -109,13 +77,10 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); - if (debug_mutex_on) { - DEBUG_LOCKS_WARN_ON(list_empty(&lock->held_list)); - list_del_init(&lock->held_list); - } } void debug_mutex_init(struct mutex *lock, const char *name) @@ -123,10 +88,8 @@ void debug_mutex_init(struct mutex *lock, const char *name) /* * Make sure we are not reinitializing a held lock: */ - mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lock->owner = NULL; - INIT_LIST_HEAD(&lock->held_list); - lock->name = name; lock->magic = lock; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index bdab13a9ee26..babfbdfc534b 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -10,102 +10,44 @@ * More details are in kernel/mutex-debug.c. */ -extern spinlock_t debug_mutex_lock; -extern struct list_head debug_mutex_held_locks; -extern int debug_mutex_on; - -/* - * In the debug case we carry the caller's instruction pointer into - * other functions, but we dont want the function argument overhead - * in the nondebug case - hence these macros: - */ -#define __IP_DECL__ , unsigned long ip -#define __IP__ , ip -#define __RET_IP__ , (unsigned long)__builtin_return_address(0) - /* * This must be called with lock->wait_lock held. 
*/ -extern void debug_mutex_set_owner(struct mutex *lock, - struct thread_info *new_owner __IP_DECL__); +extern void +debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); static inline void debug_mutex_clear_owner(struct mutex *lock) { lock->owner = NULL; } -extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti __IP_DECL__); + struct thread_info *ti); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct thread_info *ti); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name); - -#define debug_spin_lock_save(lock, flags) \ - do { \ - local_irq_save(flags); \ - if (debug_mutex_on) \ - spin_lock(lock); \ - } while (0) - -#define debug_spin_unlock_restore(lock, flags) \ - do { \ - if (debug_mutex_on) \ - spin_unlock(lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ DEBUG_LOCKS_WARN_ON(in_interrupt()); \ - debug_spin_lock_save(&debug_mutex_lock, flags); \ - spin_lock(lock); \ + local_irq_save(flags); \ + __raw_spin_lock(&(lock)->raw_lock); \ DEBUG_LOCKS_WARN_ON(l->magic != l); \ } while (0) #define spin_unlock_mutex(lock, flags) \ do { \ - spin_unlock(lock); \ - debug_spin_unlock_restore(&debug_mutex_lock, flags); \ + __raw_spin_unlock(&(lock)->raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ } while (0) - -#define DEBUG_OFF() \ -do { \ - if (debug_mutex_on) { \ - debug_mutex_on = 0; \ - console_verbose(); \ - if (spin_is_locked(&debug_mutex_lock)) \ - spin_unlock(&debug_mutex_lock); \ - } \ -} while (0) - -#define DEBUG_BUG() \ -do { \ - if (debug_mutex_on) { \ - DEBUG_OFF(); \ - BUG(); \ - } \ -} while (0) - -#define DEBUG_LOCKS_WARN_ON(c) \ -do { \ - if (unlikely(c && debug_mutex_on)) { \ - DEBUG_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_DEBUG_LOCKS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) -#else -# define SMP_DEBUG_LOCKS_WARN_ON(c) do { } while (0) -#endif - diff --git a/kernel/mutex.c b/kernel/mutex.c index 101ceeb38925..3aad0b7992f4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -38,7 +39,7 @@ * * It is not allowed to initialize an already locked mutex. */ -void fastcall __mutex_init(struct mutex *lock, const char *name) +__always_inline void fastcall __mutex_init(struct mutex *lock, const char *name) { atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); @@ -56,7 +57,7 @@ EXPORT_SYMBOL(__mutex_init); * branch is predicted by the CPU as default-untaken. */ static void fastcall noinline __sched -__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); +__mutex_lock_slowpath(atomic_t *lock_count); /*** * mutex_lock - acquire the mutex @@ -79,7 +80,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); * * This function is similar to (but not equivalent to) down(). 
*/ -void fastcall __sched mutex_lock(struct mutex *lock) +void inline fastcall __sched mutex_lock(struct mutex *lock) { might_sleep(); /* @@ -92,7 +93,7 @@ void fastcall __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); static void fastcall noinline __sched -__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); +__mutex_unlock_slowpath(atomic_t *lock_count); /*** * mutex_unlock - release the mutex @@ -120,18 +121,17 @@ EXPORT_SYMBOL(mutex_unlock); * Lock a mutex (possibly interruptible), slowpath: */ static inline int __sched -__mutex_lock_common(struct mutex *lock, long state __IP_DECL__) +__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned int old_val; unsigned long flags; - debug_mutex_init_waiter(&waiter); - spin_lock_mutex(&lock->wait_lock, flags); - debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); + debug_mutex_lock_common(lock, &waiter); + debug_mutex_add_waiter(lock, &waiter, task->thread_info); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -173,7 +173,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task->thread_info); - debug_mutex_set_owner(lock, task->thread_info __IP__); + debug_mutex_set_owner(lock, task->thread_info); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -183,32 +183,28 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) debug_mutex_free_waiter(&waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->held_list)); - DEBUG_LOCKS_WARN_ON(lock->owner != task->thread_info); - return 0; } static void fastcall noinline __sched -__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) +__mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); } /* * Release the lock, slowpath: */ -static fastcall noinline void -__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) +static fastcall inline void +__mutex_unlock_common_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); - spin_lock_mutex(&lock->wait_lock, flags); + debug_mutex_unlock(lock); /* * some architectures leave the lock unlocked in the fastpath failure @@ -218,8 +214,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); - debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = @@ -236,12 +230,21 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) spin_unlock_mutex(&lock->wait_lock, flags); } +/* + * Release the lock, slowpath: + */ +static fastcall noinline void +__mutex_unlock_slowpath(atomic_t *lock_count) +{ + __mutex_unlock_common_slowpath(lock_count); +} + /* * Here come the less common (and hence less performance-critical) APIs: * mutex_lock_interruptible() and mutex_trylock(). 
*/ static int fastcall noinline __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); +__mutex_lock_interruptible_slowpath(atomic_t *lock_count); /*** * mutex_lock_interruptible - acquire the mutex, interruptable @@ -264,11 +267,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); static int fastcall noinline __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) +__mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); } /* @@ -285,7 +288,8 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) - debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); + debug_mutex_set_owner(lock, current_thread_info()); + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); diff --git a/kernel/mutex.h b/kernel/mutex.h index 7e1ed48d1a6c..aeb2d916aa0e 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,22 +16,15 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define DEBUG_LOCKS_WARN_ON(c) do { } while (0) #define debug_mutex_set_owner(lock, new_owner) do { } while (0) #define debug_mutex_clear_owner(lock) do { } while (0) -#define debug_mutex_init_waiter(waiter) do { } while (0) #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) +#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name) do { } while (0) -/* - * Return-address parameters/declarations. They are very useful for - * debugging, but add overhead in the !DEBUG case - so we go the - * trouble of using this not too elegant but zero-cost solution: - */ -#define __IP_DECL__ -#define __IP__ -#define __RET_IP__ - +static inline void +debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ +} diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 4aa8a2c9f453..353a853bc390 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "rtmutex_common.h" @@ -45,8 +46,6 @@ do { \ console_verbose(); \ if (spin_is_locked(¤t->pi_lock)) \ spin_unlock(¤t->pi_lock); \ - if (spin_is_locked(¤t->held_list_lock)) \ - spin_unlock(¤t->held_list_lock); \ } \ } while (0) @@ -105,14 +104,6 @@ static void printk_task(task_t *p) printk(""); } -static void printk_task_short(task_t *p) -{ - if (p) - printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); - else - printk(""); -} - static void printk_lock(struct rt_mutex *lock, int print_owner) { if (lock->name) @@ -128,222 +119,6 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) printk_task(rt_mutex_owner(lock)); printk("\n"); } - if (rt_mutex_owner(lock)) { - printk("... 
acquired at: "); - print_symbol("%s\n", lock->acquire_ip); - } -} - -static void printk_waiter(struct rt_mutex_waiter *w) -{ - printk("-------------------------\n"); - printk("| waiter struct %p:\n", w); - printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", - w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, - w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, - w->list_entry.prio); - printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", - w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, - w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, - w->pi_list_entry.prio); - printk("\n| lock:\n"); - printk_lock(w->lock, 1); - printk("| w->ti->task:\n"); - printk_task(w->task); - printk("| blocked at: "); - print_symbol("%s\n", w->ip); - printk("-------------------------\n"); -} - -static void show_task_locks(task_t *p) -{ - switch (p->state) { - case TASK_RUNNING: printk("R"); break; - case TASK_INTERRUPTIBLE: printk("S"); break; - case TASK_UNINTERRUPTIBLE: printk("D"); break; - case TASK_STOPPED: printk("T"); break; - case EXIT_ZOMBIE: printk("Z"); break; - case EXIT_DEAD: printk("X"); break; - default: printk("?"); break; - } - printk_task(p); - if (p->pi_blocked_on) { - struct rt_mutex *lock = p->pi_blocked_on->lock; - - printk(" blocked on:"); - printk_lock(lock, 1); - } else - printk(" (not blocked)\n"); -} - -void rt_mutex_show_held_locks(task_t *task, int verbose) -{ - struct list_head *curr, *cursor = NULL; - struct rt_mutex *lock; - task_t *t; - unsigned long flags; - int count = 0; - - if (!rt_trace_on) - return; - - if (verbose) { - printk("------------------------------\n"); - printk("| showing all locks held by: | ("); - printk_task_short(task); - printk("):\n"); - printk("------------------------------\n"); - } - -next: - spin_lock_irqsave(&task->held_list_lock, flags); - list_for_each(curr, &task->held_list_head) { - if (cursor && curr != cursor) - continue; - lock = list_entry(curr, struct rt_mutex, held_list_entry); - t = rt_mutex_owner(lock); - WARN_ON(t != task); - count++; - cursor = curr->next; - spin_unlock_irqrestore(&task->held_list_lock, flags); - - printk("\n#%03d: ", count); - printk_lock(lock, 0); - goto next; - } - spin_unlock_irqrestore(&task->held_list_lock, flags); - - printk("\n"); -} - -void rt_mutex_show_all_locks(void) -{ - task_t *g, *p; - int count = 10; - int unlock = 1; - - printk("\n"); - printk("----------------------\n"); - printk("| showing all tasks: |\n"); - printk("----------------------\n"); - - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - printk("hm, tasklist_lock locked, retrying... 
"); - if (count) { - count--; - printk(" #%d", 10-count); - mdelay(200); - goto retry; - } - printk(" ignoring it.\n"); - unlock = 0; - } - if (count != 10) - printk(" locked it.\n"); - - do_each_thread(g, p) { - show_task_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - printk("\n"); - - printk("-----------------------------------------\n"); - printk("| showing all locks held in the system: |\n"); - printk("-----------------------------------------\n"); - - do_each_thread(g, p) { - rt_mutex_show_held_locks(p, 0); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - - printk("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); -} - -void rt_mutex_debug_check_no_locks_held(task_t *task) -{ - struct rt_mutex_waiter *w; - struct list_head *curr; - struct rt_mutex *lock; - - if (!rt_trace_on) - return; - if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { - printk("BUG: PI priority boost leaked!\n"); - printk_task(task); - printk("\n"); - } - if (list_empty(&task->held_list_head)) - return; - - spin_lock(&task->pi_lock); - plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { - TRACE_OFF(); - - printk("hm, PI interest held at exit time? Task:\n"); - printk_task(task); - printk_waiter(w); - return; - } - spin_unlock(&task->pi_lock); - - list_for_each(curr, &task->held_list_head) { - lock = list_entry(curr, struct rt_mutex, held_list_entry); - - printk("BUG: %s/%d, lock held at task exit time!\n", - task->comm, task->pid); - printk_lock(lock, 1); - if (rt_mutex_owner(lock) != task) - printk("exiting task is not even the owner??\n"); - } -} - -int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) -{ - const void *to = from + len; - struct list_head *curr; - struct rt_mutex *lock; - unsigned long flags; - void *lock_addr; - - if (!rt_trace_on) - return 0; - - spin_lock_irqsave(¤t->held_list_lock, flags); - list_for_each(curr, ¤t->held_list_head) { - lock = list_entry(curr, struct rt_mutex, held_list_entry); - lock_addr = lock; - if (lock_addr < from || lock_addr >= to) - continue; - TRACE_OFF(); - - printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", - current->comm, current->pid, lock, from, to); - dump_stack(); - printk_lock(lock, 1); - if (rt_mutex_owner(lock) != current) - printk("freeing task is not even the owner??\n"); - return 1; - } - spin_unlock_irqrestore(¤t->held_list_lock, flags); - - return 0; } void rt_mutex_debug_task_free(struct task_struct *task) @@ -395,85 +170,41 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) current->comm, current->pid); printk_lock(waiter->lock, 1); - printk("... trying at: "); - print_symbol("%s\n", waiter->ip); - printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); printk_lock(waiter->deadlock_lock, 1); - rt_mutex_show_held_locks(current, 1); - rt_mutex_show_held_locks(task, 1); + debug_show_held_locks(current); + debug_show_held_locks(task); printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); show_stack(task, NULL); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, current->pid); dump_stack(); - rt_mutex_show_all_locks(); + debug_show_all_locks(); + printk("[ turning off deadlock detection." "Please report this trace. 
]\n\n"); local_irq_disable(); } -void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) +void debug_rt_mutex_lock(struct rt_mutex *lock) { - unsigned long flags; - - if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(¤t->held_list_lock, flags); - list_add_tail(&lock->held_list_entry, ¤t->held_list_head); - spin_unlock_irqrestore(¤t->held_list_lock, flags); - - lock->acquire_ip = ip; - } } void debug_rt_mutex_unlock(struct rt_mutex *lock) { - unsigned long flags; - - if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); - TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(¤t->held_list_lock, flags); - list_del_init(&lock->held_list_entry); - spin_unlock_irqrestore(¤t->held_list_lock, flags); - } + TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); } -void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner __IP_DECL__) +void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) { - unsigned long flags; - - if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(&powner->held_list_lock, flags); - list_add_tail(&lock->held_list_entry, &powner->held_list_head); - spin_unlock_irqrestore(&powner->held_list_lock, flags); - - lock->acquire_ip = ip; - } } void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - unsigned long flags; - - if (rt_trace_on) { - struct task_struct *owner = rt_mutex_owner(lock); - - TRACE_WARN_ON_LOCKED(!owner); - TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(&owner->held_list_lock, flags); - list_del_init(&lock->held_list_entry); - spin_unlock_irqrestore(&owner->held_list_lock, flags); - } + TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -493,14 +224,11 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) { - void *addr = lock; - - if (rt_trace_on) { - rt_mutex_debug_check_no_locks_freed(addr, - sizeof(struct rt_mutex)); - INIT_LIST_HEAD(&lock->held_list_entry); - lock->name = name; - } + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lock->name = name; } void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h index 7612fbc62d70..14193d596d78 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/rtmutex-debug.h @@ -9,20 +9,16 @@ * This file contains macros used solely by rtmutex.c. Debug version. 
*/ -#define __IP_DECL__ , unsigned long ip -#define __IP__ , ip -#define __RET_IP__ , (unsigned long)__builtin_return_address(0) - extern void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); -extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); +extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner __IP_DECL__); + struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 45d61016da57..91b699aa658b 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -161,8 +161,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task, int deadlock_detect, struct rt_mutex *orig_lock, struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task - __IP_DECL__) + struct task_struct *top_task) { struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -357,7 +356,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * * Must be called with lock->wait_lock held. */ -static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +static int try_to_take_rt_mutex(struct rt_mutex *lock) { /* * We have to be careful here if the atomic speedups are @@ -384,7 +383,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) return 0; /* We got the lock. 
*/ - debug_rt_mutex_lock(lock __IP__); + debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, current, 0); @@ -402,8 +401,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - int detect_deadlock - __IP_DECL__) + int detect_deadlock) { struct rt_mutex_waiter *top_waiter = waiter; task_t *owner = rt_mutex_owner(lock); @@ -454,7 +452,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current __IP__); + current); spin_lock(&lock->wait_lock); @@ -526,7 +524,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter __IP_DECL__) + struct rt_mutex_waiter *waiter) { int first = (waiter == rt_mutex_top_waiter(lock)); int boost = 0; @@ -568,7 +566,7 @@ static void remove_waiter(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); spin_lock(&lock->wait_lock); } @@ -595,7 +593,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } /* @@ -604,7 +602,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__) + int detect_deadlock) { struct rt_mutex_waiter waiter; int ret = 0; @@ -615,7 +613,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock __IP__)) { + if (try_to_take_rt_mutex(lock)) { spin_unlock(&lock->wait_lock); return 0; } @@ -629,7 +627,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock __IP__)) + if (try_to_take_rt_mutex(lock)) break; /* @@ -653,7 +651,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ if (!waiter.task) { ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock __IP__); + detect_deadlock); /* * If we got woken up by the owner then start loop * all over without going into schedule to try @@ -680,7 +678,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) - remove_waiter(lock, &waiter __IP__); + remove_waiter(lock, &waiter); /* * try_to_take_rt_mutex() sets the waiter bit @@ -711,7 +709,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, * Slow path try-lock function: */ static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = 0; @@ -719,7 +717,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock __IP__); + ret = try_to_take_rt_mutex(lock); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. 
@@ -769,13 +767,13 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__)) + int detect_deadlock)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); + return slowfn(lock, state, NULL, detect_deadlock); } static inline int @@ -783,24 +781,24 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__)) + int detect_deadlock)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); + return slowfn(lock, state, timeout, detect_deadlock); } static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) + int (*slowfn)(struct rt_mutex *lock)) { if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 1; } - return slowfn(lock __RET_IP__); + return slowfn(lock); } static inline void @@ -948,7 +946,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { __rt_mutex_init(lock, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); + debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner, 0); rt_mutex_deadlock_account_lock(lock, proxy_owner); } diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h index 1e0fca13ff72..a1a1dd06421d 100644 --- a/kernel/rtmutex.h +++ b/kernel/rtmutex.h @@ -10,9 +10,6 @@ * Non-debug version. */ -#define __IP_DECL__ -#define __IP__ -#define __RET_IP__ #define rt_mutex_deadlock_check(l) (0) #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) #define rt_mutex_deadlock_account_unlock(l) do { } while (0) diff --git a/kernel/sched.c b/kernel/sched.c index d5e37072ea54..48c1faa60a67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -3142,12 +3143,13 @@ void fastcall add_preempt_count(int val) /* * Underflow? */ - BUG_ON((preempt_count() < 0)); + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; preempt_count() += val; /* * Spinlock count overflowing soon? */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); } EXPORT_SYMBOL(add_preempt_count); @@ -3156,11 +3158,15 @@ void fastcall sub_preempt_count(int val) /* * Underflow? */ - BUG_ON(val > preempt_count()); + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; /* * Is the spinlock portion underflowing? 
*/ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; + preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -4690,7 +4696,7 @@ void show_state(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); - mutex_debug_show_all_locks(); + debug_show_all_locks(); } /** diff --git a/lib/Makefile b/lib/Makefile index 10c13c9d7824..4f5d01922f82 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,7 @@ lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o kref.o kobject_uevent.o klist.o -obj-y += sort.o parser.o halfmd4.o iomap_copy.o +obj-y += sort.o parser.o halfmd4.o iomap_copy.o debug_locks.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/debug_locks.c b/lib/debug_locks.c new file mode 100644 index 000000000000..0ef01d14727c --- /dev/null +++ b/lib/debug_locks.c @@ -0,0 +1,45 @@ +/* + * lib/debug_locks.c + * + * Generic place for common debugging facilities for various locks: + * spinlocks, rwlocks, mutexes and rwsems. + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#include <linux/rwsem.h> +#include <linux/mutex.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/debug_locks.h> + +/* + * We want to turn all lock-debugging facilities on/off at once, + * via a global flag. The reason is that once a single bug has been + * detected and reported, there might be cascade of followup bugs + * that would just muddy the log. So we report the first one and + * shut up after that. + */ +int debug_locks = 1; + +/* + * The locking-testsuite uses <debug_locks_silent> to get a + * 'silent failure': nothing is printed to the console when + * a locking bug is detected. + */ +int debug_locks_silent; + +/* + * Generic 'turn off all lock debugging' function: + */ +int debug_locks_off(void) +{ + if (xchg(&debug_locks, 0)) { + if (!debug_locks_silent) { + console_verbose(); + return 1; + } + } + return 0; +} diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index 93c15ee3f8ea..3de2ccf48ac6 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -8,38 +8,35 @@ #include #include +#include <linux/debug_locks.h> #include +#include static void spin_bug(spinlock_t *lock, const char *msg) { - static long print_once = 1; struct task_struct *owner = NULL; - if (xchg(&print_once, 0)) { - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; - printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", - msg, raw_smp_processor_id(), - current->comm, current->pid); - printk(KERN_EMERG " lock: %p, .magic: %08x, .owner: %s/%d, " - ".owner_cpu: %d\n", - lock, lock->magic, - owner ? owner->comm : "<none>", - owner ? owner->pid : -1, - lock->owner_cpu); - dump_stack(); -#ifdef CONFIG_SMP - /* - * We cannot continue on SMP: - */ -// panic("bad locking"); -#endif - } + if (!debug_locks_off()) + return; + + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) + owner = lock->owner; + printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", + msg, raw_smp_processor_id(), + current->comm, current->pid); + printk(KERN_EMERG " lock: %p, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d\n", + lock, lock->magic, + owner ? owner->comm : "<none>", + owner ?
owner->pid : -1, + lock->owner_cpu); + dump_stack(); } #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) -static inline void debug_spin_lock_before(spinlock_t *lock) +static inline void +debug_spin_lock_before(spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(lock->owner == current, lock, "recursion"); @@ -118,20 +115,13 @@ void _raw_spin_unlock(spinlock_t *lock) static void rwlock_bug(rwlock_t *lock, const char *msg) { - static long print_once = 1; - - if (xchg(&print_once, 0)) { - printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", - msg, raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); -#ifdef CONFIG_SMP - /* - * We cannot continue on SMP: - */ - panic("bad locking"); -#endif - } + if (!debug_locks_off()) + return; + + printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", + msg, raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); } #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35f8553f893a..7b450798b458 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -330,6 +330,8 @@ void __vunmap(void *addr, int deallocate_pages) return; } + debug_check_no_locks_freed(addr, area->size); + if (deallocate_pages) { int i; -- cgit v1.2.3 From e4d919188554a77c798a267e098059bc9aa39726 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:34 -0700 Subject: [PATCH] lockdep: locking init debugging improvement Locking init improvement: - introduce and use __SPIN_LOCK_UNLOCKED for array initializations, to pass in the name string of locks, used by debugging Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 6 +++--- fs/dcache.c | 2 +- include/linux/idr.h | 2 +- include/linux/init_task.h | 10 +++++----- include/linux/notifier.h | 2 +- include/linux/seqlock.h | 12 ++++++++++-- include/linux/spinlock_types.h | 15 +++++++++------ include/linux/wait.h | 2 +- kernel/rcupdate.c | 4 ++-- kernel/timer.c | 2 +- mm/swap_state.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- 13 files changed, 37 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index 164bddae047f..4c3a5ca9d8f7 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -416,7 +416,7 @@ static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], .name = "input", .limit = 1, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&input_pool.lock), .pool = input_pool_data }; @@ -425,7 +425,7 @@ static struct entropy_store blocking_pool = { .name = "blocking", .limit = 1, .pull = &input_pool, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock), .pool = blocking_pool_data }; @@ -433,7 +433,7 @@ static struct entropy_store nonblocking_pool = { .poolinfo = &poolinfo_table[1], .name = "nonblocking", .pull = &input_pool, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock), .pool = nonblocking_pool_data }; diff --git a/fs/dcache.c b/fs/dcache.c index c6e3535be192..bec4de176c81 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -38,7 +38,7 @@ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); -static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +static 
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(dcache_lock); diff --git a/include/linux/idr.h b/include/linux/idr.h index f559a719dbe8..826803449db7 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -66,7 +66,7 @@ struct idr { .id_free = NULL, \ .layers = 0, \ .id_free_cnt = 0, \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ } #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 678c1a90380d..1b7bb37624bb 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -21,7 +21,7 @@ .count = ATOMIC_INIT(1), \ .fdt = &init_files.fdtab, \ .fdtab = INIT_FDTABLE, \ - .file_lock = SPIN_LOCK_UNLOCKED, \ + .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \ .next_fd = 0, \ .close_on_exec_init = { { 0, } }, \ .open_fds_init = { { 0, } }, \ @@ -36,7 +36,7 @@ .user_id = 0, \ .next = NULL, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait), \ - .ctx_lock = SPIN_LOCK_UNLOCKED, \ + .ctx_lock = __SPIN_LOCK_UNLOCKED(name.ctx_lock), \ .reqs_active = 0U, \ .max_reqs = ~0U, \ } @@ -48,7 +48,7 @@ .mm_users = ATOMIC_INIT(2), \ .mm_count = ATOMIC_INIT(1), \ .mmap_sem = __RWSEM_INITIALIZER(name.mmap_sem), \ - .page_table_lock = SPIN_LOCK_UNLOCKED, \ + .page_table_lock = __SPIN_LOCK_UNLOCKED(name.page_table_lock), \ .mmlist = LIST_HEAD_INIT(name.mmlist), \ .cpu_vm_mask = CPU_MASK_ALL, \ } @@ -69,7 +69,7 @@ #define INIT_SIGHAND(sighand) { \ .count = ATOMIC_INIT(1), \ .action = { { { .sa_handler = NULL, } }, }, \ - .siglock = SPIN_LOCK_UNLOCKED, \ + .siglock = __SPIN_LOCK_UNLOCKED(sighand.siglock), \ } extern struct group_info init_groups; @@ -119,7 +119,7 @@ extern struct group_info init_groups; .list = LIST_HEAD_INIT(tsk.pending.list), \ .signal = {{0}}}, \ .blocked = {{0}}, \ - .alloc_lock = SPIN_LOCK_UNLOCKED, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 51dbab9710c7..7ff386a6ae87 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -65,7 +65,7 @@ struct raw_notifier_head { } while (0) #define ATOMIC_NOTIFIER_INIT(name) { \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = NULL } #define BLOCKING_NOTIFIER_INIT(name) { \ .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 7bc5c7c12b54..46000936f8f1 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -38,9 +38,17 @@ typedef struct { * These macros triggered gcc-3.x compile-time problems. We think these are * OK now. Be cautious. */ -#define SEQLOCK_UNLOCKED { 0, SPIN_LOCK_UNLOCKED } -#define seqlock_init(x) do { *(x) = (seqlock_t) SEQLOCK_UNLOCKED; } while (0) +#define __SEQLOCK_UNLOCKED(lockname) \ + { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#define SEQLOCK_UNLOCKED \ + __SEQLOCK_UNLOCKED(old_style_seqlock_init) + +#define seqlock_init(x) \ + do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); } while (0) + +#define DEFINE_SEQLOCK(x) \ + seqlock_t x = __SEQLOCK_UNLOCKED(x) /* Lock out other writers and update the count. * Acts like a normal spin_lock/unlock. 
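To make the conversion pattern concrete before the spinlock_types.h diff below: this is how static lock definitions look before and after the change, using only macros that appear in this patch's diffs. It is kernel-context illustration rather than standalone code, and the variable names are made up:

/* before: the initializer carried no name, so lock debugging saw none */
static spinlock_t old_lock = SPIN_LOCK_UNLOCKED;

/* after: DEFINE_SPINLOCK()/DEFINE_SEQLOCK() pass the variable's own name */
static DEFINE_SPINLOCK(new_lock);
static DEFINE_SEQLOCK(new_seqlock);

/* aggregate members name themselves explicitly, as in the conversions above: */
struct foo { spinlock_t lock; };
static struct foo f = { .lock = __SPIN_LOCK_UNLOCKED(f.lock) };

The old SPIN_LOCK_UNLOCKED and SEQLOCK_UNLOCKED spellings are kept as compatibility aliases (named old_style_spin_init and old_style_seqlock_init in the diff), which is why existing code keeps compiling.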
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 9cb51e070390..f5d4ed7bc785 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -44,24 +44,27 @@ typedef struct { #define SPINLOCK_OWNER_INIT ((void *)-1L) #ifdef CONFIG_DEBUG_SPINLOCK -# define SPIN_LOCK_UNLOCKED \ +# define __SPIN_LOCK_UNLOCKED(lockname) \ (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1 } -#define RW_LOCK_UNLOCKED \ +#define __RW_LOCK_UNLOCKED(lockname) \ (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ .magic = RWLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1 } #else -# define SPIN_LOCK_UNLOCKED \ +# define __SPIN_LOCK_UNLOCKED(lockname) \ (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED } -#define RW_LOCK_UNLOCKED \ +#define __RW_LOCK_UNLOCKED(lockname) \ (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED } #endif -#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED -#define DEFINE_RWLOCK(x) rwlock_t x = RW_LOCK_UNLOCKED +#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) + +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/wait.h b/include/linux/wait.h index 544e855c7c02..bc4f389c49bb 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -68,7 +68,7 @@ struct task_struct; wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .task_list = { &(name).task_list, &(name).task_list } } #define DECLARE_WAIT_QUEUE_HEAD(name) \ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f464f5ae3f11..759805c9859a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,13 +53,13 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; diff --git a/kernel/timer.c b/kernel/timer.c index 5a8960253063..4dd9a10d67d0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1208,7 +1208,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES; * playing with xtime and avenrun. 
*/ #ifndef ARCH_HAVE_XTIME_LOCK -seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); #endif diff --git a/mm/swap_state.c b/mm/swap_state.c index fccbd9bba77b..5f7cf2a4cb55 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = RW_LOCK_UNLOCKED, + .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8355b729fa95..823717285c6d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -90,7 +90,7 @@ static struct socket *tcp_socket; void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { - .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock), .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e0851697ad5e..0ccb7cb22b15 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -40,7 +40,7 @@ int sysctl_tcp_abort_on_overflow; struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = SPIN_LOCK_UNLOCKED, + .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), .hashinfo = &tcp_hashinfo, .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, (unsigned long)&tcp_death_row), -- cgit v1.2.3 From 3ac94932a2c859e8c57a8af091fa39334e1c3f23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:36 -0700 Subject: [PATCH] lockdep: beautify x86_64 stacktraces Beautify x86_64 stacktraces to be more readable. 
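As a concrete illustration of the new format: the reworked printk_address() in the diff below prints one " [<address>] symbol+offset/size" entry per line, instead of packing several "<address>{symbol+offset}" entries onto one line. This standalone snippet reproduces the new format string with made-up values (address, symbol name, offset and size are all hypothetical):

#include <stdio.h>

int main(void)
{
	/* all values hypothetical, chosen only to show the layout: */
	unsigned long address = 0xffffffff80123456UL;
	const char *modname = "", *delim = "";	/* built-in symbol: no module */
	const char *symname = "do_example";
	unsigned long offset = 0x2a, symsize = 0x90;

	/* same format string the patch introduces in printk_address(): */
	printf(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
	       address, delim, modname, delim, symname, offset, symsize);
	return 0;
}

This prints " [<ffffffff80123456>] do_example+0x2a/0x90": one entry per line, so long traces no longer wrap unpredictably.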
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 2 +- arch/x86_64/kernel/traps.c | 70 ++++++++++++++++++++------------------------ arch/x86_64/mm/fault.c | 1 - include/asm-x86_64/kdebug.h | 2 +- 4 files changed, 33 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index ca56e19b8b6e..bb6745d13b8f 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -296,7 +296,7 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 5a5311d3de0f..02dc155da56f 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -110,28 +110,31 @@ static int kstack_depth_to_print = 12; static int call_trace = 1; #ifdef CONFIG_KALLSYMS -#include -int printk_address(unsigned long address) -{ +# include +void printk_address(unsigned long address) +{ unsigned long offset = 0, symsize; const char *symname; char *modname; - char *delim = ":"; + char *delim = ":"; char namebuf[128]; - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%016lx>]\n", address); + return; + } + if (!modname) modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address, delim, modname, delim, symname, offset); -} + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", + address, delim, modname, delim, symname, offset, symsize); +} #else -int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} +void printk_address(unsigned long address) +{ + printk(" [<%016lx>]\n", address); +} #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -193,20 +196,14 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, static int show_trace_unwind(struct unwind_frame_info *info, void *context) { - int i = 11, n = 0; + int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { - ++n; - if (i > 50) { - printk("\n "); - i = 7; - } else - i += printk(" "); - i += printk_address(UNW_PC(info)); + n++; + printk_address(UNW_PC(info)); if (arch_unw_user_mode(info)) break; } - printk("\n"); return n; } @@ -224,7 +221,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s int i = 11; unsigned used = 0; - printk("\nCall Trace:"); + printk("\nCall Trace:\n"); if (!tsk) tsk = current; @@ -254,12 +251,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s do while (cond) { \ unsigned long addr = *stack++; \ if (kernel_text_address(addr)) { \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -268,20 +259,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to 
figure \ * out the call path that was taken. \ */ \ - i += printk_address(addr); \ + printk_address(addr); \ } \ } while (0) - for(; ; ) { + for( ; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - i += printk(" <%s>", id); + printk(" <%s>", id); HANDLE_STACK (stack < estack_end); - i += printk(" "); + printk(" "); stack = (unsigned long *) estack_end[-2]; continue; } @@ -291,11 +282,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - i += printk(" "); + printk(" "); HANDLE_STACK (stack < irqstack_end); stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - i += printk(" "); + printk(" "); continue; } } @@ -304,6 +295,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK + printk("\n"); } @@ -337,8 +329,8 @@ static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned break; } if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); + printk("\n"); + printk(" %016lx", *stack++); touch_nmi_watchdog(); } show_trace(tsk, regs, rsp); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 5afcf6eb00fa..ac8ea66ccb94 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -570,7 +570,6 @@ no_context: printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at %016lx RIP: \n" KERN_ALERT,address); printk_address(regs->rip); - printk("\n"); dump_pagetable(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h index cd52c7f33bca..2b0c088e2957 100644 --- a/include/asm-x86_64/kdebug.h +++ b/include/asm-x86_64/kdebug.h @@ -49,7 +49,7 @@ static inline int notify_die(enum die_val val, const char *str, return atomic_notifier_call_chain(&die_chain, val, &args); } -extern int printk_address(unsigned long address); +extern void printk_address(unsigned long address); extern void die(const char *,struct pt_regs *,long); extern void __die(const char *,struct pt_regs *,long); extern void show_registers(struct pt_regs *regs); -- cgit v1.2.3 From 8637c09901049f061b94f684915d4f18ecf91d79 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:38 -0700 Subject: [PATCH] lockdep: stacktrace subsystem, core Framework to generate and save stacktraces quickly, without printing anything to the console. 
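The entries buffer is caller-provided, so no allocation happens on the
trace path. A hypothetical caller (the helper name and buffer size here
are illustrative only, not part of the patch) would look like:

	#include <linux/sched.h>
	#include <linux/stacktrace.h>

	static unsigned long entries[32];

	static void dump_current_stack(void)
	{
		struct stack_trace trace = {
			.nr_entries  = 0,
			.max_entries = 32,
			.entries     = entries,
		};

		/* current task, current context only, skip no entries */
		save_stack_trace(&trace, current, 0, 0);

		/* resolve and print each saved entry, indented */
		print_stack_trace(&trace, 1);
	}

Note that save_stack_trace() itself is per-architecture (hence the
STACKTRACE_SUPPORT dependency); this patch only adds the generic
framework and the printing helper.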
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stacktrace.h | 20 ++++++++++++++++++++ kernel/Makefile | 1 + kernel/stacktrace.c | 24 ++++++++++++++++++++++++ lib/Kconfig.debug | 6 +++++- 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 include/linux/stacktrace.h create mode 100644 kernel/stacktrace.c (limited to 'include') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h new file mode 100644 index 000000000000..9cc81e572224 --- /dev/null +++ b/include/linux/stacktrace.h @@ -0,0 +1,20 @@ +#ifndef __LINUX_STACKTRACE_H +#define __LINUX_STACKTRACE_H + +#ifdef CONFIG_STACKTRACE +struct stack_trace { + unsigned int nr_entries, max_entries; + unsigned long *entries; +}; + +extern void save_stack_trace(struct stack_trace *trace, + struct task_struct *task, int all_contexts, + unsigned int skip); + +extern void print_stack_trace(struct stack_trace *trace, int spaces); +#else +# define save_stack_trace(trace, task, all, skip) do { } while (0) +# define print_stack_trace(trace) do { } while (0) +#endif + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 82fb182f6f61..e7fd20e70f1b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c new file mode 100644 index 000000000000..b71816e47a30 --- /dev/null +++ b/kernel/stacktrace.c @@ -0,0 +1,24 @@ +/* + * kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + */ +#include +#include +#include + +void print_stack_trace(struct stack_trace *trace, int spaces) +{ + int i, j; + + for (i = 0; i < trace->nr_entries; i++) { + unsigned long ip = trace->entries[i]; + + for (j = 0; j < spaces + 1; j++) + printk(" "); + print_ip_sym(ip); + } +} + diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7b3863d4f5e8..04e374c6fd46 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -107,7 +107,7 @@ config DEBUG_SLAB_LEAK config DEBUG_PREEMPT bool "Debug preemptible kernel" - depends on DEBUG_KERNEL && PREEMPT + depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT default y help If you say Y here then the kernel will use a debug variant of the @@ -149,6 +149,10 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config STACKTRACE + bool + depends on STACKTRACE_SUPPORT + config DEBUG_KOBJECT bool "kobject debugging" depends on DEBUG_KERNEL -- cgit v1.2.3 From de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:42 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, core Accurate hard-IRQ-flags and softirq-flags state tracing. This allows us to attach extra functionality to IRQ flags on/off events (such as trace-on/off). 
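The wrappers in the new include/linux/irqflags.h keep the tracing hooks
consistent with the hardware state by ordering them carefully: the "off"
hook runs after interrupts are really disabled and the "on" hook runs
before they are re-enabled, so the hooks themselves always execute with
interrupts off. In outline (these two macros appear verbatim in the
irqflags.h hunk below):

	#define local_irq_disable() \
		do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
	#define local_irq_enable() \
		do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)

local_irq_restore() has to test the saved flags word first, since
depending on what was saved it is either an "on" or an "off" event. On
the softirq side, local_bh_disable()/local_bh_enable() turn from macros
into real functions in kernel/softirq.c, so they can call
trace_softirqs_on()/trace_softirqs_off() only on the outermost
disable/enable transition.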
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/irq.c | 2 +- include/asm-powerpc/irqflags.h | 31 ++++++++++ include/linux/hardirq.h | 26 +++++++- include/linux/init_task.h | 2 + include/linux/interrupt.h | 11 ++-- include/linux/irqflags.h | 96 +++++++++++++++++++++++++++++ include/linux/sched.h | 15 +++++ kernel/fork.c | 19 ++++++ kernel/sched.c | 4 +- kernel/softirq.c | 137 +++++++++++++++++++++++++++++++++++------ 10 files changed, 313 insertions(+), 30 deletions(-) create mode 100644 include/asm-powerpc/irqflags.h create mode 100644 include/linux/irqflags.h (limited to 'include') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 525baab45d2d..027728b95429 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -429,7 +429,7 @@ void do_softirq(void) local_bh_disable(); do_softirq_onstack(); account_system_vtime(current); - __local_bh_enable(); + _local_bh_enable(); } local_irq_restore(flags); diff --git a/include/asm-powerpc/irqflags.h b/include/asm-powerpc/irqflags.h new file mode 100644 index 000000000000..7970cbaeaa54 --- /dev/null +++ b/include/asm-powerpc/irqflags.h @@ -0,0 +1,31 @@ +/* + * include/asm-powerpc/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() macros from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +/* + * Get definitions for raw_local_save_flags(x), etc. + */ +#include + +/* + * Do the CPU's IRQ-state tracing from assembly code. We call a + * C function, so save all the C-clobbered registers: + */ +#ifdef CONFIG_TRACE_IRQFLAGS + +#error No support on PowerPC yet for CONFIG_TRACE_IRQFLAGS + +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif + +#endif diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 114ae583cca9..b1d4332b5cf0 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -86,9 +86,6 @@ extern void synchronize_irq(unsigned int irq); # define synchronize_irq(irq) barrier() #endif -#define nmi_enter() irq_enter() -#define nmi_exit() sub_preempt_count(HARDIRQ_OFFSET) - struct task_struct; #ifndef CONFIG_VIRT_CPU_ACCOUNTING @@ -97,12 +94,35 @@ static inline void account_system_vtime(struct task_struct *tsk) } #endif +/* + * It is safe to do non-atomic ops on ->hardirq_context, + * because NMI handlers may not preempt and the ops are + * always balanced, so the interrupted value of ->hardirq_context + * will always be restored. 
+ */ #define irq_enter() \ do { \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ + trace_hardirq_enter(); \ + } while (0) + +/* + * Exit irq context without processing softirqs: + */ +#define __irq_exit() \ + do { \ + trace_hardirq_exit(); \ + account_system_vtime(current); \ + sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) +/* + * Exit irq context and process softirqs if needed: + */ extern void irq_exit(void); +#define nmi_enter() irq_enter() +#define nmi_exit() __irq_exit() + #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1b7bb37624bb..444a3ae0de2a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -3,6 +3,7 @@ #include #include +#include #define INIT_FDTABLE \ { \ @@ -124,6 +125,7 @@ extern struct group_info init_groups; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ + INIT_TRACE_IRQFLAGS \ } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 73463fbb38e4..d5afee95fd43 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -199,13 +200,11 @@ static inline void __deprecated save_and_cli(unsigned long *x) #define save_and_cli(x) save_and_cli(&x) #endif /* CONFIG_SMP */ -/* SoftIRQ primitives. */ -#define local_bh_disable() \ - do { add_preempt_count(SOFTIRQ_OFFSET); barrier(); } while (0) -#define __local_bh_enable() \ - do { barrier(); sub_preempt_count(SOFTIRQ_OFFSET); } while (0) - +extern void local_bh_disable(void); +extern void __local_bh_enable(void); +extern void _local_bh_enable(void); extern void local_bh_enable(void); +extern void local_bh_enable_ip(unsigned long ip); /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h new file mode 100644 index 000000000000..412e025bc5c7 --- /dev/null +++ b/include/linux/irqflags.h @@ -0,0 +1,96 @@ +/* + * include/linux/irqflags.h + * + * IRQ flags tracing: follow the state of the hardirq and softirq flags and + * provide callbacks for transitions between ON and OFF states. + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() macros from the lowlevel headers. 
+ */ +#ifndef _LINUX_TRACE_IRQFLAGS_H +#define _LINUX_TRACE_IRQFLAGS_H + +#ifdef CONFIG_TRACE_IRQFLAGS + extern void trace_hardirqs_on(void); + extern void trace_hardirqs_off(void); + extern void trace_softirqs_on(unsigned long ip); + extern void trace_softirqs_off(unsigned long ip); +# define trace_hardirq_context(p) ((p)->hardirq_context) +# define trace_softirq_context(p) ((p)->softirq_context) +# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) +# define trace_softirqs_enabled(p) ((p)->softirqs_enabled) +# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) +# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) +# define trace_softirq_enter() do { current->softirq_context++; } while (0) +# define trace_softirq_exit() do { current->softirq_context--; } while (0) +# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, +#else +# define trace_hardirqs_on() do { } while (0) +# define trace_hardirqs_off() do { } while (0) +# define trace_softirqs_on(ip) do { } while (0) +# define trace_softirqs_off(ip) do { } while (0) +# define trace_hardirq_context(p) 0 +# define trace_softirq_context(p) 0 +# define trace_hardirqs_enabled(p) 0 +# define trace_softirqs_enabled(p) 0 +# define trace_hardirq_enter() do { } while (0) +# define trace_hardirq_exit() do { } while (0) +# define trace_softirq_enter() do { } while (0) +# define trace_softirq_exit() do { } while (0) +# define INIT_TRACE_IRQFLAGS +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + +#include + +#define local_irq_enable() \ + do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) +#define local_irq_disable() \ + do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0) +#define local_irq_save(flags) \ + do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0) + +#define local_irq_restore(flags) \ + do { \ + if (raw_irqs_disabled_flags(flags)) { \ + raw_local_irq_restore(flags); \ + trace_hardirqs_off(); \ + } else { \ + trace_hardirqs_on(); \ + raw_local_irq_restore(flags); \ + } \ + } while (0) +#else /* !CONFIG_TRACE_IRQFLAGS_SUPPORT */ +/* + * The local_irq_*() APIs are equal to the raw_local_irq*() + * if !TRACE_IRQFLAGS. 
+ */ +# define raw_local_irq_disable() local_irq_disable() +# define raw_local_irq_enable() local_irq_enable() +# define raw_local_irq_save(flags) local_irq_save(flags) +# define raw_local_irq_restore(flags) local_irq_restore(flags) +#endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ + +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +#define safe_halt() \ + do { \ + trace_hardirqs_on(); \ + raw_safe_halt(); \ + } while (0) + +#define local_save_flags(flags) raw_local_save_flags(flags) + +#define irqs_disabled() \ +({ \ + unsigned long flags; \ + \ + raw_local_save_flags(flags); \ + raw_irqs_disabled_flags(flags); \ +}) + +#define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags) +#endif /* CONFIG_X86 */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index bdabeee10a78..ad7a89014d29 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -871,6 +871,21 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_TRACE_IRQFLAGS + unsigned int irq_events; + int hardirqs_enabled; + unsigned long hardirq_enable_ip; + unsigned int hardirq_enable_event; + unsigned long hardirq_disable_ip; + unsigned int hardirq_disable_event; + int softirqs_enabled; + unsigned long softirq_disable_ip; + unsigned int softirq_disable_event; + unsigned long softirq_enable_ip; + unsigned int softirq_enable_event; + int hardirq_context; + int softirq_context; +#endif /* journalling filesystem info */ void *journal_info; diff --git a/kernel/fork.c b/kernel/fork.c index 1cd46a4fb0d3..b7db7fb74f53 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -968,6 +968,10 @@ static task_t *copy_process(unsigned long clone_flags, if (!p) goto fork_out; +#ifdef CONFIG_TRACE_IRQFLAGS + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); +#endif retval = -EAGAIN; if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { @@ -1042,6 +1046,21 @@ static task_t *copy_process(unsigned long clone_flags, } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_TRACE_IRQFLAGS + p->irq_events = 0; + p->hardirqs_enabled = 0; + p->hardirq_enable_ip = 0; + p->hardirq_enable_event = 0; + p->hardirq_disable_ip = _THIS_IP_; + p->hardirq_disable_event = 0; + p->softirqs_enabled = 1; + p->softirq_enable_ip = _THIS_IP_; + p->softirq_enable_event = 0; + p->softirq_disable_ip = 0; + p->softirq_disable_event = 0; + p->hardirq_context = 0; + p->softirq_context = 0; +#endif rt_mutex_init_task(p); diff --git a/kernel/sched.c b/kernel/sched.c index 48c1faa60a67..911829966534 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4462,7 +4462,9 @@ int __sched cond_resched_softirq(void) BUG_ON(!in_softirq()); if (need_resched() && __resched_legal()) { - __local_bh_enable(); + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); __cond_resched(); local_bh_disable(); return 1; diff --git a/kernel/softirq.c b/kernel/softirq.c index 8f03e3b89b55..584609b6a66e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -61,6 +61,119 @@ static inline void wakeup_softirqd(void) wake_up_process(tsk); } +/* + * This one is for softirq.c-internal use, + * where hardirqs are disabled legitimately: + */ +static void __local_bh_disable(unsigned long ip) +{ + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + raw_local_irq_save(flags); + add_preempt_count(SOFTIRQ_OFFSET); + /* + * Were softirqs turned off above: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_off(ip); + raw_local_irq_restore(flags); +} + +void 
local_bh_disable(void) +{ + __local_bh_disable((unsigned long)__builtin_return_address(0)); +} + +EXPORT_SYMBOL(local_bh_disable); + +void __local_bh_enable(void) +{ + WARN_ON_ONCE(in_irq()); + + /* + * softirqs should never be enabled by __local_bh_enable(), + * it always nests inside local_bh_enable() sections: + */ + WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); + + sub_preempt_count(SOFTIRQ_OFFSET); +} +EXPORT_SYMBOL_GPL(__local_bh_enable); + +/* + * Special-case - softirqs can safely be enabled in + * cond_resched_softirq(), or by __do_softirq(), + * without processing still-pending softirqs: + */ +void _local_bh_enable(void) +{ + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(SOFTIRQ_OFFSET); +} + +EXPORT_SYMBOL(_local_bh_enable); + +void local_bh_enable(void) +{ + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(irqs_disabled()); + + local_irq_save(flags); + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); + local_irq_restore(flags); + preempt_check_resched(); +} +EXPORT_SYMBOL(local_bh_enable); + +void local_bh_enable_ip(unsigned long ip) +{ + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + local_irq_save(flags); + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on(ip); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); + local_irq_restore(flags); + preempt_check_resched(); +} +EXPORT_SYMBOL(local_bh_enable_ip); + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. 
@@ -80,8 +193,9 @@ asmlinkage void __do_softirq(void) int cpu; pending = local_softirq_pending(); + __local_bh_disable((unsigned long)__builtin_return_address(0)); + trace_softirq_enter(); - local_bh_disable(); cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ @@ -109,7 +223,8 @@ restart: if (pending) wakeup_softirqd(); - __local_bh_enable(); + trace_softirq_exit(); + _local_bh_enable(); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -136,23 +251,6 @@ EXPORT_SYMBOL(do_softirq); #endif -void local_bh_enable(void) -{ - WARN_ON(irqs_disabled()); - /* - * Keep preemption disabled until we are done with - * softirq processing: - */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); - - if (unlikely(!in_interrupt() && local_softirq_pending())) - do_softirq(); - - dec_preempt_count(); - preempt_check_resched(); -} -EXPORT_SYMBOL(local_bh_enable); - #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -165,6 +263,7 @@ EXPORT_SYMBOL(local_bh_enable); void irq_exit(void) { account_system_vtime(current); + trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -- cgit v1.2.3 From 55f327fa9e876758491a82af7491104f1cc3fc4d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:43 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, i386 support Add irqflags-tracing support to i386. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig.debug | 4 ++++ arch/i386/kernel/entry.S | 36 ++++++++++++++++++++++++++--- arch/i386/kernel/irq.c | 6 ++++- include/asm-i386/irqflags.h | 56 +++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/spinlock.h | 5 ++++ include/asm-i386/system.h | 20 +--------------- 6 files changed, 104 insertions(+), 23 deletions(-) create mode 100644 include/asm-i386/irqflags.h (limited to 'include') diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index b20ddd04f3d8..b31c0802e1cc 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -1,5 +1,9 @@ menu "Kernel hacking" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + source "lib/Kconfig.debug" config EARLY_PRINTK diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 787190c45fdb..d9a260f2efb4 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -76,12 +77,21 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop cli +#define preempt_stop cli; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck #endif +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $IF_MASK,EFLAGS(%esp) # interrupts off? 
+ jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + #ifdef CONFIG_VM86 #define resume_userspace_sig check_userspace #else @@ -257,6 +267,10 @@ ENTRY(sysenter_entry) CFI_REGISTER esp, ebp movl TSS_sysenter_esp0(%esp),%esp sysenter_past_esp: + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 @@ -303,6 +317,7 @@ sysenter_past_esp: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) cli + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work @@ -310,6 +325,7 @@ sysenter_past_esp: movl EIP(%esp), %edx movl OLDESP(%esp), %ecx xorl %ebp,%ebp + TRACE_IRQS_ON sti sysexit CFI_ENDPROC @@ -339,6 +355,7 @@ syscall_exit: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -355,12 +372,15 @@ restore_all: CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: + TRACE_IRQS_IRET +restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 1: iret .section .fixup,"ax" iret_exc: + TRACE_IRQS_ON sti pushl $0 # no error code pushl $do_iret_error @@ -386,11 +406,13 @@ ldt_ss: subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 cli + TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, * and a switch16 pointer on top of the current frame. */ call setup_x86_bogus_stack CFI_ADJUST_CFA_OFFSET -8 # frame has moved + TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack 1: iret @@ -411,6 +433,7 @@ work_resched: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? @@ -462,6 +485,7 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending + TRACE_IRQS_ON sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax @@ -535,9 +559,14 @@ ENTRY(irq_entries_start) vector=vector+1 .endr +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr @@ -549,9 +578,10 @@ ENTRY(name) \ pushl $~(nr); \ CFI_ADJUST_CFA_OFFSET 4; \ SAVE_ALL; \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ - jmp ret_from_intr; \ + jmp ret_from_intr; \ CFI_ENDPROC /* The include is where all of the SMP etc. 
interrupts come from */ @@ -726,7 +756,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck_notrace CFI_ENDPROC nmi_stack_fixup: diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 16b491703967..6cb529f60dcc 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -166,7 +166,7 @@ void irq_ctx_init(int cpu) irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET; + irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); softirq_ctx[cpu] = irqctx; @@ -211,6 +211,10 @@ asmlinkage void do_softirq(void) : "0"(isp) : "memory", "cc", "edx", "ecx", "eax" ); + /* + * Shouldnt happen, we returned above if in_interrupt(): + */ + WARN_ON_ONCE(softirq_count()); } local_irq_restore(flags); diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h new file mode 100644 index 000000000000..ca777e894c92 --- /dev/null +++ b/include/asm-i386/irqflags.h @@ -0,0 +1,56 @@ +/* + * include/asm-i386/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() macros from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +#define raw_local_save_flags(x) do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0) +#define raw_local_irq_restore(x) do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0) +#define raw_local_irq_disable() __asm__ __volatile__("cli": : :"memory") +#define raw_local_irq_enable() __asm__ __volatile__("sti": : :"memory") +/* used in the idle loop; sti takes one instruction cycle to complete */ +#define raw_safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") +/* used when interrupts are already enabled or to shutdown the processor */ +#define halt() __asm__ __volatile__("hlt": : :"memory") + +#define raw_irqs_disabled_flags(flags) (!((flags) & (1<<9))) + +/* For spinlocks etc */ +#define raw_local_irq_save(x) __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +/* + * Do the CPU's IRQ-state tracing from assembly code. We call a + * C function, so save all the C-clobbered registers: + */ +#ifdef CONFIG_TRACE_IRQFLAGS + +# define TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +# define TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif + +#endif diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index 04ba30234c48..7e29b51bcaa0 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -31,6 +31,11 @@ "jmp 1b\n" \ "3:\n\t" +/* + * NOTE: there's an irqs-on section here, which normally would have to be + * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use + * __raw_spin_lock_string_flags(). 
+ */ #define __raw_spin_lock_string_flags \ "\n1:\t" \ "lock ; decb %0\n\t" \ diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index cab0180567f9..db398d88b1d9 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -456,25 +456,7 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long l #define set_wmb(var, value) do { var = value; wmb(); } while (0) -/* interrupt control.. */ -#define local_save_flags(x) do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0) -#define local_irq_restore(x) do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0) -#define local_irq_disable() __asm__ __volatile__("cli": : :"memory") -#define local_irq_enable() __asm__ __volatile__("sti": : :"memory") -/* used in the idle loop; sti takes one instruction cycle to complete */ -#define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") -/* used when interrupts are already enabled or to shutdown the processor */ -#define halt() __asm__ __volatile__("hlt": : :"memory") - -#define irqs_disabled() \ -({ \ - unsigned long flags; \ - local_save_flags(flags); \ - !(flags & (1<<9)); \ -}) - -/* For spinlocks etc */ -#define local_irq_save(x) __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") +#include /* * disable hlt during certain critical i/o operations -- cgit v1.2.3 From c8558fcdecb1f920df8050be4f2d5f499060030e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:44 -0700 Subject: [PATCH] lockdep: irqtrace cleanup of include/asm-i386/irqflags.h Clean up the x86 irqflags.h file: - macro => inline function transformation - simplifications - style fixes Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/irqflags.h | 95 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index ca777e894c92..e1bdb97c07fa 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -5,24 +5,95 @@ * * This file gets included from lowlevel asm headers too, to provide * wrapped versions of the local_irq_*() APIs, based on the - * raw_local_irq_*() macros from the lowlevel headers. + * raw_local_irq_*() functions from the lowlevel headers. 
*/ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H -#define raw_local_save_flags(x) do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0) -#define raw_local_irq_restore(x) do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0) -#define raw_local_irq_disable() __asm__ __volatile__("cli": : :"memory") -#define raw_local_irq_enable() __asm__ __volatile__("sti": : :"memory") -/* used in the idle loop; sti takes one instruction cycle to complete */ -#define raw_safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") -/* used when interrupts are already enabled or to shutdown the processor */ -#define halt() __asm__ __volatile__("hlt": : :"memory") +#ifndef __ASSEMBLY__ -#define raw_irqs_disabled_flags(flags) (!((flags) & (1<<9))) +static inline unsigned long __raw_local_save_flags(void) +{ + unsigned long flags; -/* For spinlocks etc */ -#define raw_local_irq_save(x) __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + __asm__ __volatile__( + "pushfl ; popl %0" + : "=g" (flags) + : /* no input */ + ); + + return flags; +} + +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +static inline void raw_local_irq_restore(unsigned long flags) +{ + __asm__ __volatile__( + "pushl %0 ; popfl" + : /* no output */ + :"g" (flags) + :"memory", "cc" + ); +} + +static inline void raw_local_irq_disable(void) +{ + __asm__ __volatile__("cli" : : : "memory"); +} + +static inline void raw_local_irq_enable(void) +{ + __asm__ __volatile__("sti" : : : "memory"); +} + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static inline void raw_safe_halt(void) +{ + __asm__ __volatile__("sti; hlt" : : : "memory"); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static inline void halt(void) +{ + __asm__ __volatile__("hlt": : :"memory"); +} + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1 << 9)); +} + +static inline int raw_irqs_disabled(void) +{ + unsigned long flags = __raw_local_save_flags(); + + return raw_irqs_disabled_flags(flags); +} + +/* + * For spinlocks, etc: + */ +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_disable(); + + return flags; +} + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +#endif /* __ASSEMBLY__ */ /* * Do the CPU's IRQ-state tracing from assembly code. We call a -- cgit v1.2.3 From 2601e64d262ee5ed4d4a5737345803800d9c4db3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:45 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, x86_64 support Add irqflags-tracing support to x86_64. 
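One subtlety handled here: iretq restores the saved RFLAGS, so returning
from an interrupt can itself be an implicit irqs-on event. The new
TRACE_IRQS_IRETQ macro therefore tests the saved IF bit (bit 9) in the
stack frame and only then calls the tracing hook; in C terms (a
hypothetical helper mirroring the asm macro, not code from the patch):

	/* C analogue of TRACE_IRQS_IRETQ: if the frame we are about to
	 * return to had IF set, record the irqs-on transition first. */
	static inline void trace_irqs_iretq(struct pt_regs *regs)
	{
		if (regs->eflags & (1 << 9))	/* X86_EFLAGS_IF */
			trace_hardirqs_on();
	}

NMIs are the exception: paranoidentry/paranoidexit grow an irqtrace flag
that is 0 for the NMI handler only, because the soft and hard irq-state
flags cannot be updated atomically from NMI context.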
[akpm@osdl.org: build fix] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/Kconfig.debug | 4 + arch/x86_64/ia32/ia32entry.S | 19 ++++- arch/x86_64/kernel/entry.S | 188 ++++++++++++++++++++++++++++++------------ arch/x86_64/kernel/irq.c | 4 +- include/asm-x86_64/irqflags.h | 61 ++++++++++++++ include/asm-x86_64/system.h | 38 +-------- 6 files changed, 223 insertions(+), 91 deletions(-) create mode 100644 include/asm-x86_64/irqflags.h (limited to 'include') diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug index 1d92ab56c0f9..775d211a5cf9 100644 --- a/arch/x86_64/Kconfig.debug +++ b/arch/x86_64/Kconfig.debug @@ -1,5 +1,9 @@ menu "Kernel hacking" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + source "lib/Kconfig.debug" config DEBUG_RODATA diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index c536fa98ea37..9b5bb413a6e9 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) @@ -75,6 +76,10 @@ ENTRY(ia32_sysenter_target) swapgs movq %gs:pda_kernelstack, %rsp addq $(PDA_STACKOFFSET),%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs, here we enable it straight after entry: + */ sti movl %ebp,%ebp /* zero extension */ pushq $__USER32_DS @@ -118,6 +123,7 @@ sysenter_do_call: movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call andl $~TS_COMPAT,threadinfo_status(%r10) @@ -132,6 +138,7 @@ sysenter_do_call: CFI_REGISTER rsp,rcx movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ CFI_REGISTER rip,rdx + TRACE_IRQS_ON swapgs sti /* sti only takes effect after the next instruction */ /* sysexit */ @@ -186,6 +193,10 @@ ENTRY(ia32_cstar_target) movl %esp,%r8d CFI_REGISTER rsp,r8 movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti SAVE_ARGS 8,1,1 movl %eax,%eax /* zero extension */ @@ -220,6 +231,7 @@ cstar_do_call: movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call andl $~TS_COMPAT,threadinfo_status(%r10) @@ -228,6 +240,7 @@ cstar_do_call: CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ + TRACE_IRQS_ON movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp swapgs @@ -286,7 +299,11 @@ ENTRY(ia32_syscall) /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP - swapgs + swapgs + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti movl %eax,%eax pushq %rax diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed92c2983606..d464dded68c0 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -41,13 +41,24 @@ #include #include #include +#include .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif - + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + /* * C code is not supposed to know about undefined top of stack. 
Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -194,6 +205,10 @@ ENTRY(system_call) swapgs movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ sti SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) @@ -219,10 +234,15 @@ ret_from_sys_call: sysret_check: GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz sysret_careful + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 @@ -237,6 +257,7 @@ sysret_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -247,6 +268,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: + TRACE_IRQS_ON sti testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f @@ -261,6 +283,7 @@ sysret_signal: /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli + TRACE_IRQS_OFF jmp int_with_check badsys: @@ -309,6 +332,7 @@ ENTRY(int_ret_from_sys_call) CFI_REL_OFFSET r10,R10-ARGOFFSET CFI_REL_OFFSET r11,R11-ARGOFFSET cli + TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi @@ -327,6 +351,7 @@ int_with_check: int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -334,10 +359,12 @@ int_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 cli + TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: + TRACE_IRQS_ON sti SAVE_REST /* Check for syscall exit trace */ @@ -351,6 +378,7 @@ int_very_careful: CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi cli + TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -363,6 +391,7 @@ int_signal: int_restore_rest: RESTORE_REST cli + TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC END(int_ret_from_sys_call) @@ -484,6 +513,10 @@ END(stub_rt_sigreturn) swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count cmoveq %gs:pda_irqstackptr,%rsp + /* + * We entered an interrupt context - irqs are off: + */ + TRACE_IRQS_OFF call \func .endm @@ -493,6 +526,7 @@ ENTRY(common_interrupt) /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: cli + TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq CFI_DEF_CFA_REGISTER rsp @@ -515,9 +549,21 @@ retint_check: CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: + /* + * The iretq could re-enable interrupts: + */ + cli + TRACE_IRQS_IRETQ swapgs + jmp restore_args + retint_restore_args: cli + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: RESTORE_ARGS 0,8,0 iret_label: iretq @@ -530,6 +576,7 @@ iret_label: /* running with kernel gs */ bad_iret: movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON sti jmp do_exit .previous @@ -539,6 +586,7 @@ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -547,11 +595,13 @@ retint_careful: CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF jmp retint_check retint_signal: testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs + TRACE_IRQS_ON sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -560,6 +610,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli + TRACE_IRQS_OFF movl 
$_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -666,7 +717,7 @@ END(spurious_interrupt) /* error code is on the stack already */ /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0 + .macro paranoidentry sym, ist=0, irqtrace=1 SAVE_ALL cld movl $1,%ebx @@ -691,8 +742,73 @@ END(spurious_interrupt) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif cli + .if \irqtrace + TRACE_IRQS_OFF + .endif .endm - + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + TRACE_IRQS_IRETQ 0 + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm + /* * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. @@ -748,6 +864,7 @@ error_exit: movl %ebx,%eax RESTORE_REST cli + TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel @@ -755,6 +872,10 @@ error_exit: movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ swapgs RESTORE_ARGS 0,8,0 jmp iret_label @@ -916,8 +1037,7 @@ KPROBE_ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC + paranoidexit END(debug) .previous .text @@ -926,49 +1046,13 @@ KPROBE_ENTRY(nmi) INTR_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - */ - /* ebx: no swapgs flag */ -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? 
*/ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - swapgs -paranoid_restore: - RESTORE_ALL 8 - iretq -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - jmp paranoid_userspace -paranoid_schedule: - sti - call schedule - cli - jmp paranoid_userspace - CFI_ENDPROC + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif END(nmi) .previous .text @@ -977,7 +1061,7 @@ KPROBE_ENTRY(int3) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(int3) .previous .text @@ -1006,7 +1090,7 @@ END(reserved) ENTRY(double_fault) XCPT_FRAME paranoidentry do_double_fault - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(double_fault) @@ -1022,7 +1106,7 @@ END(segment_not_present) ENTRY(stack_segment) XCPT_FRAME paranoidentry do_stack_segment - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(stack_segment) @@ -1050,7 +1134,7 @@ ENTRY(machine_check) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(machine_check) #endif diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index a1f1df5f7bfc..5221a53e90c1 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -177,8 +177,10 @@ asmlinkage void do_softirq(void) local_irq_save(flags); pending = local_softirq_pending(); /* Switch to interrupt stack */ - if (pending) + if (pending) { call_softirq(); + WARN_ON_ONCE(softirq_count()); + } local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); diff --git a/include/asm-x86_64/irqflags.h b/include/asm-x86_64/irqflags.h new file mode 100644 index 000000000000..22f3c06b247e --- /dev/null +++ b/include/asm-x86_64/irqflags.h @@ -0,0 +1,61 @@ +/* + * include/asm-x86_64/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() macros from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +#ifndef __ASSEMBLY__ + +/* interrupt control.. 
*/ +#define raw_local_save_flags(x) do { warn_if_not_ulong(x); __asm__ __volatile__("# save_flags \n\t pushfq ; popq %q0":"=g" (x): /* no input */ :"memory"); } while (0) +#define raw_local_irq_restore(x) __asm__ __volatile__("# restore_flags \n\t pushq %0 ; popfq": /* no output */ :"g" (x):"memory", "cc") + +#ifdef CONFIG_X86_VSMP +/* Interrupt control for VSMP architecture */ +#define raw_local_irq_disable() do { unsigned long flags; raw_local_save_flags(flags); raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); } while (0) +#define raw_local_irq_enable() do { unsigned long flags; raw_local_save_flags(flags); raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); } while (0) + +#define raw_irqs_disabled_flags(flags) \ +({ \ + (flags & (1<<18)) || !(flags & (1<<9)); \ +}) + +/* For spinlocks etc */ +#define raw_local_irq_save(x) do { raw_local_save_flags(x); raw_local_irq_restore((x & ~(1 << 9)) | (1 << 18)); } while (0) +#else /* CONFIG_X86_VSMP */ +#define raw_local_irq_disable() __asm__ __volatile__("cli": : :"memory") +#define raw_local_irq_enable() __asm__ __volatile__("sti": : :"memory") + +#define raw_irqs_disabled_flags(flags) \ +({ \ + !(flags & (1<<9)); \ +}) + +/* For spinlocks etc */ +#define raw_local_irq_save(x) do { warn_if_not_ulong(x); __asm__ __volatile__("# raw_local_irq_save \n\t pushfq ; popq %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0) +#endif + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags; \ + raw_local_save_flags(flags); \ + raw_irqs_disabled_flags(flags); \ +}) + +/* used in the idle loop; sti takes one instruction cycle to complete */ +#define raw_safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") +/* used when interrupts are already enabled or to shutdown the processor */ +#define halt() __asm__ __volatile__("hlt": : :"memory") + +#else /* __ASSEMBLY__: */ +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif + +#endif diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h index 68e559f3631c..f67f2873a922 100644 --- a/include/asm-x86_64/system.h +++ b/include/asm-x86_64/system.h @@ -244,43 +244,7 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, #define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0) -/* interrupt control.. 
*/ -#define local_save_flags(x) do { warn_if_not_ulong(x); __asm__ __volatile__("# save_flags \n\t pushfq ; popq %q0":"=g" (x): /* no input */ :"memory"); } while (0) -#define local_irq_restore(x) __asm__ __volatile__("# restore_flags \n\t pushq %0 ; popfq": /* no output */ :"g" (x):"memory", "cc") - -#ifdef CONFIG_X86_VSMP -/* Interrupt control for VSMP architecture */ -#define local_irq_disable() do { unsigned long flags; local_save_flags(flags); local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); } while (0) -#define local_irq_enable() do { unsigned long flags; local_save_flags(flags); local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); } while (0) - -#define irqs_disabled() \ -({ \ - unsigned long flags; \ - local_save_flags(flags); \ - (flags & (1<<18)) || !(flags & (1<<9)); \ -}) - -/* For spinlocks etc */ -#define local_irq_save(x) do { local_save_flags(x); local_irq_restore((x & ~(1 << 9)) | (1 << 18)); } while (0) -#else /* CONFIG_X86_VSMP */ -#define local_irq_disable() __asm__ __volatile__("cli": : :"memory") -#define local_irq_enable() __asm__ __volatile__("sti": : :"memory") - -#define irqs_disabled() \ -({ \ - unsigned long flags; \ - local_save_flags(flags); \ - !(flags & (1<<9)); \ -}) - -/* For spinlocks etc */ -#define local_irq_save(x) do { warn_if_not_ulong(x); __asm__ __volatile__("# local_irq_save \n\t pushfq ; popq %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0) -#endif - -/* used in the idle loop; sti takes one instruction cycle to complete */ -#define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") -/* used when interrupts are already enabled or to shutdown the processor */ -#define halt() __asm__ __volatile__("hlt": : :"memory") +#include void cpu_idle_wait(void); -- cgit v1.2.3 From 6375e2b74c620794e1a27a26e4338aec2e41346a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:45 -0700 Subject: [PATCH] lockdep: irqtrace cleanup of include/asm-x86_64/irqflags.h Clean up the x86-64 irqflags.h file: - macro => inline function transformation - simplifications - style fixes Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/lib/thunk.S | 5 ++ include/asm-x86_64/irqflags.h | 156 ++++++++++++++++++++++++++++++++---------- 2 files changed, 123 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index e49af0032e94..332ea5dff916 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -47,6 +47,11 @@ thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up + +#ifdef CONFIG_TRACE_IRQFLAGS + thunk trace_hardirqs_on_thunk,trace_hardirqs_on + thunk trace_hardirqs_off_thunk,trace_hardirqs_off +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC diff --git a/include/asm-x86_64/irqflags.h b/include/asm-x86_64/irqflags.h index 22f3c06b247e..cce6937e87c0 100644 --- a/include/asm-x86_64/irqflags.h +++ b/include/asm-x86_64/irqflags.h @@ -5,57 +5,137 @@ * * This file gets included from lowlevel asm headers too, to provide * wrapped versions of the local_irq_*() APIs, based on the - * raw_local_irq_*() macros from the lowlevel headers. + * raw_local_irq_*() functions from the lowlevel headers. 
*/ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H #ifndef __ASSEMBLY__ +/* + * Interrupt control: + */ + +static inline unsigned long __raw_local_save_flags(void) +{ + unsigned long flags; + + __asm__ __volatile__( + "# __raw_save_flags\n\t" + "pushfq ; popq %q0" + : "=g" (flags) + : /* no input */ + : "memory" + ); -/* interrupt control.. */ -#define raw_local_save_flags(x) do { warn_if_not_ulong(x); __asm__ __volatile__("# save_flags \n\t pushfq ; popq %q0":"=g" (x): /* no input */ :"memory"); } while (0) -#define raw_local_irq_restore(x) __asm__ __volatile__("# restore_flags \n\t pushq %0 ; popfq": /* no output */ :"g" (x):"memory", "cc") + return flags; +} + +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +static inline void raw_local_irq_restore(unsigned long flags) +{ + __asm__ __volatile__( + "pushq %0 ; popfq" + : /* no output */ + :"g" (flags) + :"memory", "cc" + ); +} #ifdef CONFIG_X86_VSMP -/* Interrupt control for VSMP architecture */ -#define raw_local_irq_disable() do { unsigned long flags; raw_local_save_flags(flags); raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); } while (0) -#define raw_local_irq_enable() do { unsigned long flags; raw_local_save_flags(flags); raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); } while (0) - -#define raw_irqs_disabled_flags(flags) \ -({ \ - (flags & (1<<18)) || !(flags & (1<<9)); \ -}) - -/* For spinlocks etc */ -#define raw_local_irq_save(x) do { raw_local_save_flags(x); raw_local_irq_restore((x & ~(1 << 9)) | (1 << 18)); } while (0) -#else /* CONFIG_X86_VSMP */ -#define raw_local_irq_disable() __asm__ __volatile__("cli": : :"memory") -#define raw_local_irq_enable() __asm__ __volatile__("sti": : :"memory") - -#define raw_irqs_disabled_flags(flags) \ -({ \ - !(flags & (1<<9)); \ -}) - -/* For spinlocks etc */ -#define raw_local_irq_save(x) do { warn_if_not_ulong(x); __asm__ __volatile__("# raw_local_irq_save \n\t pushfq ; popq %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0) + +/* + * Interrupt control for the VSMP architecture: + */ + +static inline void raw_local_irq_disable(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); +} + +static inline void raw_local_irq_enable(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); +} + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1<<9)) || (flags & (1 << 18)); +} + +#else /* CONFIG_X86_VSMP */ + +static inline void raw_local_irq_disable(void) +{ + __asm__ __volatile__("cli" : : : "memory"); +} + +static inline void raw_local_irq_enable(void) +{ + __asm__ __volatile__("sti" : : : "memory"); +} + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1 << 9)); +} + #endif -#define raw_irqs_disabled() \ -({ \ - unsigned long flags; \ - raw_local_save_flags(flags); \ - raw_irqs_disabled_flags(flags); \ -}) +/* + * For spinlocks, etc.: + */ + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_disable(); + + return flags; +} -/* used in the idle loop; sti takes one instruction cycle to complete */ -#define raw_safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") -/* used when interrupts are already enabled or to shutdown the processor */ -#define halt() __asm__ __volatile__("hlt": : :"memory") +#define raw_local_irq_save(flags) \ + do { (flags) = 
__raw_local_irq_save(); } while (0) + +static inline int raw_irqs_disabled(void) +{ + unsigned long flags = __raw_local_save_flags(); + + return raw_irqs_disabled_flags(flags); +} + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static inline void raw_safe_halt(void) +{ + __asm__ __volatile__("sti; hlt" : : : "memory"); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static inline void halt(void) +{ + __asm__ __volatile__("hlt": : :"memory"); +} #else /* __ASSEMBLY__: */ -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF +# ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk +# else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +# endif #endif #endif -- cgit v1.2.3 From 1f194a4c393103ac925001d7e04b05fbb122580d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Jul 2006 00:24:46 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, s390 support irqtrace support for s390. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar Cc: Martin Schwidefsky Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig.debug | 4 ++++ arch/s390/kernel/entry.S | 29 +++++++++++++++++++++++++- arch/s390/kernel/entry64.S | 21 +++++++++++++++++++ arch/s390/kernel/irq.c | 2 +- arch/s390/kernel/process.c | 1 + drivers/s390/char/sclp.c | 10 +++++---- drivers/s390/cio/cio.c | 2 +- include/asm-s390/irqflags.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ include/asm-s390/system.h | 32 +---------------------------- 9 files changed, 113 insertions(+), 38 deletions(-) create mode 100644 include/asm-s390/irqflags.h (limited to 'include') diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index f53b6d5300e5..2283933a9a93 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -1,5 +1,9 @@ menu "Kernel hacking" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + source "lib/Kconfig.debug" endmenu diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index d8948c342caf..5b5799ac8f83 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -58,6 +58,21 @@ STACK_SIZE = 1 << STACK_SHIFT #define BASED(name) name-system_call(%r13) +#ifdef CONFIG_TRACE_IRQFLAGS + .macro TRACE_IRQS_ON + l %r1,BASED(.Ltrace_irq_on) + basr %r14,%r1 + .endm + + .macro TRACE_IRQS_OFF + l %r1,BASED(.Ltrace_irq_off) + basr %r14,%r1 + .endm +#else +#define TRACE_IRQS_ON +#define TRACE_IRQS_OFF +#endif + /* * Register usage in interrupt handlers: * R9 - pointer to current task structure @@ -361,6 +376,7 @@ ret_from_fork: st %r15,SP_R15(%r15) # store stack pointer for new kthread 0: l %r1,BASED(.Lschedtail) basr %r14,%r1 + TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts b BASED(sysc_return) @@ -516,6 +532,7 @@ pgm_no_vtime3: mvc __THREAD_per+__PER_address(4,%r1),__LC_PER_ADDRESS mvc __THREAD_per+__PER_access_id(1,%r1),__LC_PER_ACCESS_ID oi __TI_flags+3(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP + TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts b BASED(sysc_do_svc) @@ -539,9 +556,11 @@ io_int_handler: io_no_vtime: #endif l %r9,__LC_THREAD_INFO # load pointer to thread_info struct + TRACE_IRQS_OFF l %r1,BASED(.Ldo_IRQ) # load address of do_IRQ la %r2,SP_PTREGS(%r15) # address of register-save area basr %r14,%r1 # branch to standard irq handler + TRACE_IRQS_ON io_return: tm SP_PSW+1(%r15),0x01 # returning to user ? 
@@ -651,10 +670,12 @@ ext_int_handler: ext_no_vtime: #endif l %r9,__LC_THREAD_INFO # load pointer to thread_info struct + TRACE_IRQS_OFF la %r2,SP_PTREGS(%r15) # address of register-save area lh %r3,__LC_EXT_INT_CODE # get interruption code l %r1,BASED(.Ldo_extint) basr %r14,%r1 + TRACE_IRQS_ON b BASED(io_return) __critical_end: @@ -731,8 +752,10 @@ mcck_no_vtime: stosm __SF_EMPTY(%r15),0x04 # turn dat on tm __TI_flags+3(%r9),_TIF_MCCK_PENDING bno BASED(mcck_return) + TRACE_IRQS_OFF l %r1,BASED(.Ls390_handle_mcck) basr %r14,%r1 # call machine check handler + TRACE_IRQS_ON mcck_return: mvc __LC_RETURN_MCCK_PSW(8),SP_PSW(%r15) # move return PSW ni __LC_RETURN_MCCK_PSW+1,0xfd # clear wait state bit @@ -1012,7 +1035,11 @@ cleanup_io_leave_insn: .Lvfork: .long sys_vfork .Lschedtail: .long schedule_tail .Lsysc_table: .long sys_call_table - +#ifdef CONFIG_TRACE_IRQFLAGS +.Ltrace_irq_on:.long trace_hardirqs_on +.Ltrace_irq_off: + .long trace_hardirqs_off +#endif .Lcritical_start: .long __critical_start + 0x80000000 .Lcritical_end: diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 1ca499fa54b4..56f5f613b868 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -58,6 +58,19 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK | _TIF_NEED_RESCHED | \ #define BASED(name) name-system_call(%r13) +#ifdef CONFIG_TRACE_IRQFLAGS + .macro TRACE_IRQS_ON + brasl %r14,trace_hardirqs_on + .endm + + .macro TRACE_IRQS_OFF + brasl %r14,trace_hardirqs_off + .endm +#else +#define TRACE_IRQS_ON +#define TRACE_IRQS_OFF +#endif + .macro STORE_TIMER lc_offset #ifdef CONFIG_VIRT_CPU_ACCOUNTING stpt \lc_offset @@ -354,6 +367,7 @@ ret_from_fork: jo 0f stg %r15,SP_R15(%r15) # store stack pointer for new kthread 0: brasl %r14,schedule_tail + TRACE_IRQS_ON stosm 24(%r15),0x03 # reenable interrupts j sysc_return @@ -535,6 +549,7 @@ pgm_no_vtime3: mvc __THREAD_per+__PER_address(8,%r1),__LC_PER_ADDRESS mvc __THREAD_per+__PER_access_id(1,%r1),__LC_PER_ACCESS_ID oi __TI_flags+7(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP + TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts j sysc_do_svc @@ -557,8 +572,10 @@ io_int_handler: io_no_vtime: #endif lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct + TRACE_IRQS_OFF la %r2,SP_PTREGS(%r15) # address of register-save area brasl %r14,do_IRQ # call standard irq handler + TRACE_IRQS_ON io_return: tm SP_PSW+1(%r15),0x01 # returning to user ? 
@@ -665,9 +682,11 @@ ext_int_handler: ext_no_vtime: #endif lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct + TRACE_IRQS_OFF la %r2,SP_PTREGS(%r15) # address of register-save area llgh %r3,__LC_EXT_INT_CODE # get interruption code brasl %r14,do_extint + TRACE_IRQS_ON j io_return __critical_end: @@ -743,7 +762,9 @@ mcck_no_vtime: stosm __SF_EMPTY(%r15),0x04 # turn dat on tm __TI_flags+7(%r9),_TIF_MCCK_PENDING jno mcck_return + TRACE_IRQS_OFF brasl %r14,s390_handle_mcck + TRACE_IRQS_ON mcck_return: mvc __LC_RETURN_MCCK_PSW(16),SP_PSW(%r15) # move return PSW ni __LC_RETURN_MCCK_PSW+1,0xfd # clear wait state bit diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 480b6a5fef3a..e347190d9aea 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -97,7 +97,7 @@ asmlinkage void do_softirq(void) account_system_vtime(current); - __local_bh_enable(); + _local_bh_enable(); local_irq_restore(flags); } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 1f9399191794..78c8e5548caf 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -142,6 +142,7 @@ static void default_idle(void) return; } + trace_hardirqs_on(); /* Wait for external, I/O or machine check interrupt. */ __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT); diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 4138564402b8..985d1613baaa 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -383,6 +383,7 @@ void sclp_sync_wait(void) { unsigned long psw_mask; + unsigned long flags; unsigned long cr0, cr0_sync; u64 timeout; @@ -395,9 +396,11 @@ sclp_sync_wait(void) sclp_tod_from_jiffies(sclp_request_timer.expires - jiffies); } + local_irq_save(flags); /* Prevent bottom half from executing once we force interrupts open */ local_bh_disable(); /* Enable service-signal interruption, disable timer interrupts */ + trace_hardirqs_on(); __ctl_store(cr0, 0, 0); cr0_sync = cr0; cr0_sync |= 0x00000200; @@ -415,11 +418,10 @@ sclp_sync_wait(void) barrier(); cpu_relax(); } - /* Restore interrupt settings */ - asm volatile ("SSM 0(%0)" - : : "a" (&psw_mask) : "memory"); + local_irq_disable(); __ctl_load(cr0, 0, 0); - __local_bh_enable(); + _local_bh_enable(); + local_irq_restore(flags); } EXPORT_SYMBOL(sclp_sync_wait); diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index a3423267467f..6fec90eab00e 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -147,7 +147,7 @@ cio_tpi(void) sch->driver->irq(&sch->dev); spin_unlock(&sch->lock); irq_exit (); - __local_bh_enable(); + _local_bh_enable(); return 1; } diff --git a/include/asm-s390/irqflags.h b/include/asm-s390/irqflags.h new file mode 100644 index 000000000000..65f4db627e7a --- /dev/null +++ b/include/asm-s390/irqflags.h @@ -0,0 +1,50 @@ +/* + * include/asm-s390/irqflags.h + * + * Copyright (C) IBM Corp. 2006 + * Author(s): Heiko Carstens + */ + +#ifndef __ASM_IRQFLAGS_H +#define __ASM_IRQFLAGS_H + +#ifdef __KERNEL__ + +/* interrupt control.. 
*/ +#define raw_local_irq_enable() ({ \ + unsigned long __dummy; \ + __asm__ __volatile__ ( \ + "stosm 0(%1),0x03" \ + : "=m" (__dummy) : "a" (&__dummy) : "memory" ); \ + }) + +#define raw_local_irq_disable() ({ \ + unsigned long __flags; \ + __asm__ __volatile__ ( \ + "stnsm 0(%1),0xfc" : "=m" (__flags) : "a" (&__flags) ); \ + __flags; \ + }) + +#define raw_local_save_flags(x) \ + __asm__ __volatile__("stosm 0(%1),0" : "=m" (x) : "a" (&x), "m" (x) ) + +#define raw_local_irq_restore(x) \ + __asm__ __volatile__("ssm 0(%0)" : : "a" (&x), "m" (x) : "memory") + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags; \ + local_save_flags(flags); \ + !((flags >> __FLAG_SHIFT) & 3); \ +}) + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !((flags >> __FLAG_SHIFT) & 3); +} + +/* For spinlocks etc */ +#define raw_local_irq_save(x) ((x) = raw_local_irq_disable()) + +#endif /* __KERNEL__ */ +#endif /* __ASM_IRQFLAGS_H */ diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h index 71a0732cd518..9ab186ffde23 100644 --- a/include/asm-s390/system.h +++ b/include/asm-s390/system.h @@ -301,34 +301,6 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) #define set_mb(var, value) do { var = value; mb(); } while (0) #define set_wmb(var, value) do { var = value; wmb(); } while (0) -/* interrupt control.. */ -#define local_irq_enable() ({ \ - unsigned long __dummy; \ - __asm__ __volatile__ ( \ - "stosm 0(%1),0x03" \ - : "=m" (__dummy) : "a" (&__dummy) : "memory" ); \ - }) - -#define local_irq_disable() ({ \ - unsigned long __flags; \ - __asm__ __volatile__ ( \ - "stnsm 0(%1),0xfc" : "=m" (__flags) : "a" (&__flags) ); \ - __flags; \ - }) - -#define local_save_flags(x) \ - __asm__ __volatile__("stosm 0(%1),0" : "=m" (x) : "a" (&x), "m" (x) ) - -#define local_irq_restore(x) \ - __asm__ __volatile__("ssm 0(%0)" : : "a" (&x), "m" (x) : "memory") - -#define irqs_disabled() \ -({ \ - unsigned long flags; \ - local_save_flags(flags); \ - !((flags >> __FLAG_SHIFT) & 3); \ -}) - #ifdef __s390x__ #define __ctl_load(array, low, high) ({ \ @@ -442,8 +414,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) }) #endif /* __s390x__ */ -/* For spinlocks etc */ -#define local_irq_save(x) ((x) = local_irq_disable()) +#include <asm/irqflags.h> /* * Use to set psw mask except for the first byte which @@ -482,4 +453,3 @@ extern void (*_machine_power_off)(void); #endif /* __KERNEL__ */ #endif - -- cgit v1.2.3 From fbb9ce9530fd9b66096d5187fa6a115d16d9746c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:50 -0700 Subject: [PATCH] lockdep: core Do 'make oldconfig' and accept all the defaults for new config options - reboot into the kernel and if everything goes well it should boot up fine and you should have /proc/lockdep and /proc/lockdep_stats files. Typically if the lock validator finds some problem it will print out voluminous debug output that begins with "BUG: ..."; this syslog output can be used by kernel developers to figure out the precise locking scenario. What does the lock validator do? It "observes" and maps all locking rules as they occur dynamically (as triggered by the kernel's natural use of spinlocks, rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a new locking scenario, it validates this new rule against the existing set of rules. If this new rule is consistent with the existing set of rules then the new rule is added transparently and the kernel continues as normal. If the new rule could create a deadlock scenario then this condition is printed out.
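As a minimal illustration (hypothetical lock names, not code from this patch): the simplest such conflicting rule is the classic AB-BA ordering inversion. Once both functions below have run once each - in any order, on any CPUs, with no actual deadlock occurring - the validator reports a possible circular locking dependency:

static DEFINE_SPINLOCK(lock_a);	/* hypothetical example locks */
static DEFINE_SPINLOCK(lock_b);

static void path_one(void)
{
	spin_lock(&lock_a);
	spin_lock(&lock_b);	/* establishes the rule lock_a -> lock_b */
	spin_unlock(&lock_b);
	spin_unlock(&lock_a);
}

static void path_two(void)
{
	spin_lock(&lock_b);
	spin_lock(&lock_a);	/* lock_b -> lock_a contradicts that rule */
	spin_unlock(&lock_a);
	spin_unlock(&lock_b);
}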
When determining validity of locking, all possible "deadlock scenarios" are considered: assuming an arbitrary number of CPUs, arbitrary irq context and task context constellations, running arbitrary combinations of all the existing locking scenarios. In a typical system this means millions of separate scenarios. This is why we call it a "locking correctness" validator - for all rules that are observed, the lock validator proves, with mathematical certainty, that a deadlock could not occur (assuming that the lock validator implementation itself is correct and its internal data structures are not corrupted by some other kernel subsystem). [see more details and conditionals of this statement in include/linux/lockdep.h and Documentation/lockdep-design.txt] Furthermore, this "all possible scenarios" property of the validator also enables the finding of complex, highly unlikely multi-CPU multi-context races via individual single-context rules, drastically increasing the likelihood of finding bugs. In practical terms: the lock validator already found a bug in the upstream kernel that could only occur on systems with 3 or more CPUs, and which needed 3 very unlikely code sequences to occur at once on the 3 CPUs. That bug was found and reported on a single-CPU system (!). So in essence a race will be found "piecemeal-wise", triggering all the necessary components for the race, without having to reproduce the race scenario itself! In its short existence the lock validator found and reported many bugs before they actually caused a real deadlock. To further increase the efficiency of the validator, the mapping is not per "lock instance", but per "lock-class". For example, all struct inode objects in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached, then there are 10,000 lock objects. But ->inotify_mutex is a single "lock type", and all locking activities that occur against ->inotify_mutex are "unified" into this single lock-class. The advantage of the lock-class approach is that all historical ->inotify_mutex uses are mapped into a single (and as narrow as possible) set of locking rules - regardless of how many different tasks or inode structures it took to build this set of rules. The set of rules persists during the lifetime of the kernel. To see the rough magnitude of checking that the lock validator does, here's a portion of /proc/lockdep_stats, fresh after bootup:

 lock-classes:                  694 [max: 2048]
 direct dependencies:          1598 [max: 8192]
 indirect dependencies:       17896
 all direct dependencies:     16206
 dependency chains:            1910 [max: 8192]
 in-hardirq chains:              17
 in-softirq chains:             105
 in-process chains:            1065
 stack-trace entries:         38761 [max: 131072]
 combined max dependencies: 2033928
 hardirq-safe locks:             24
 hardirq-unsafe locks:          176
 softirq-safe locks:             53
 softirq-unsafe locks:          137
 irq-safe locks:                 59
 irq-unsafe locks:              176

The lock validator has observed 1598 actual single-thread locking patterns, and has validated all possible 2033928 distinct locking scenarios.
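To make the lock-class mapping above concrete, here is a small sketch (hypothetical structure and names, not code from this patch; with lockdep enabled, mutex_init() is a macro that defines one static struct lock_class_key at its call site):

struct cached_object {
	struct mutex lock;	/* hypothetical example */
};

static void cached_object_init(struct cached_object *obj)
{
	/*
	 * Every object initialized through this call site shares the
	 * one static key that the mutex_init() macro defines here, so
	 * 10,000 instances still collapse into a single lock-class.
	 */
	mutex_init(&obj->lock);
}

Where one initialization site covers locks that are used in genuinely different ways, lockdep_set_class() (introduced in this patch) can be used to split them into separate classes.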
More details about the design of the lock validator can be found in Documentation/lockdep-design.txt, which can also be found at: http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt [bunk@stusta.de: cleanups] Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hardirq.h | 5 +- include/linux/init_task.h | 2 + include/linux/lockdep.h | 347 ++ include/linux/sched.h | 7 + init/main.c | 14 + kernel/Makefile | 1 + kernel/fork.c | 5 + kernel/irq/manage.c | 6 + kernel/lockdep.c | 2703 ++++++++++++++++++++++++++++++++++++++++++++ kernel/lockdep_internals.h | 78 ++ kernel/module.c | 3 + lib/Kconfig.debug | 2 +- lib/locking-selftest.c | 4 +- 13 files changed, 3171 insertions(+), 6 deletions(-) create mode 100644 include/linux/lockdep.h create mode 100644 kernel/lockdep.c create mode 100644 kernel/lockdep_internals.h (limited to 'include') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index b1d4332b5cf0..50d8b5744cf6 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -122,7 +123,7 @@ static inline void account_system_vtime(struct task_struct *tsk) */ extern void irq_exit(void); -#define nmi_enter() irq_enter() -#define nmi_exit() __irq_exit() +#define nmi_enter() do { lockdep_off(); irq_enter(); } while (0) +#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 444a3ae0de2a..60aac2cea0cf 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -4,6 +4,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -126,6 +127,7 @@ extern struct group_info init_groups; .fs_excl = ATOMIC_INIT(0), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ INIT_TRACE_IRQFLAGS \ + INIT_LOCKDEP \ } diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h new file mode 100644 index 000000000000..80ec7a4dbc98 --- /dev/null +++ b/include/linux/lockdep.h @@ -0,0 +1,347 @@ +/* + * Runtime locking correctness validator + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * + * see Documentation/lockdep-design.txt for more details.
+ */ +#ifndef __LINUX_LOCKDEP_H +#define __LINUX_LOCKDEP_H + +#include +#include +#include +#include + +#ifdef CONFIG_LOCKDEP + +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit +{ + LOCK_USED = 0, + LOCK_USED_IN_HARDIRQ, + LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, + LOCK_ENABLED_HARDIRQS, + LOCK_USED_IN_HARDIRQ_READ, + LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS_READ, + LOCK_ENABLED_HARDIRQS_READ, + LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define LOCKF_USED (1 << LOCK_USED) +#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) +#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) +#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) +#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) + +#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) +#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) +#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) +#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) + +#define LOCKF_ENABLED_IRQS_READ \ + (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) +#define LOCKF_USED_IN_IRQ_READ \ + (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + +#define MAX_LOCKDEP_SUBCLASSES 8UL + +/* + * Lock-classes are keyed via unique addresses, by embedding the + * lockclass-key into the kernel (or module) .data section. (For + * static locks we use the lock address itself as the key.) + */ +struct lockdep_subclass_key { + char __one_byte; +} __attribute__ ((__packed__)); + +struct lock_class_key { + struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; +}; + +/* + * The lock-class itself: + */ +struct lock_class { + /* + * class-hash: + */ + struct list_head hash_entry; + + /* + * global list of all lock-classes: + */ + struct list_head lock_entry; + + struct lockdep_subclass_key *key; + unsigned int subclass; + + /* + * IRQ/softirq usage tracking bits: + */ + unsigned long usage_mask; + struct stack_trace usage_traces[LOCK_USAGE_STATES]; + + /* + * These fields represent a directed graph of lock dependencies, + * to every node we attach a list of "forward" and a list of + * "backward" graph nodes. + */ + struct list_head locks_after, locks_before; + + /* + * Generation counter, when doing certain classes of graph walking, + * to ensure that we check one node only once: + */ + unsigned int version; + + /* + * Statistics counter: + */ + unsigned long ops; + + const char *name; + int name_version; +}; + +/* + * Map the lock object (the lock instance) to the lock-class object. + * This is embedded into specific lock instances: + */ +struct lockdep_map { + struct lock_class_key *key; + struct lock_class *class[MAX_LOCKDEP_SUBCLASSES]; + const char *name; +}; + +/* + * Every lock has a list of other locks that were taken after it. + * We only grow the list, never remove from it: + */ +struct lock_list { + struct list_head entry; + struct lock_class *class; + struct stack_trace trace; +}; + +/* + * We record lock dependency chains, so that we can cache them: + */ +struct lock_chain { + struct list_head entry; + u64 chain_key; +}; + +struct held_lock { + /* + * One-way hash of the dependency chain up to this point. We + * hash the hashes step by step as the dependency chain grows. 
+ * + * We use it for dependency-caching and we skip detection + * passes and dependency-updates if there is a cache-hit, so + * it is absolutely critical for 100% coverage of the validator + * to have a unique key value for every unique dependency path + * that can occur in the system, to make a unique hash value + * as likely as possible - hence the 64-bit width. + * + * The task struct holds the current hash value (initialized + * with zero); here we store the previous hash value: + */ + u64 prev_chain_key; + struct lock_class *class; + unsigned long acquire_ip; + struct lockdep_map *instance; + + /* + * The lock-stack is unified in that the lock chains of interrupt + * contexts nest on top of process context chains, but we 'separate' + * the hashes by starting with 0 if we cross into an interrupt + * context, and we also do not add cross-context lock + * dependencies - the lock usage graph walking covers that area + * anyway, and we'd just unnecessarily increase the number of + * dependencies otherwise. [Note: hardirq and softirq contexts + * are separated from each other too.] + * + * The following field is used to detect when we cross into an + * interrupt context: + */ + int irq_context; + int trylock; + int read; + int check; + int hardirqs_off; +}; + +/* + * Initialization, self-test and debugging-output methods: + */ +extern void lockdep_init(void); +extern void lockdep_info(void); +extern void lockdep_reset(void); +extern void lockdep_reset_lock(struct lockdep_map *lock); +extern void lockdep_free_key_range(void *start, unsigned long size); + +extern void lockdep_off(void); +extern void lockdep_on(void); +extern int lockdep_internal(void); + +/* + * These methods are used by specific locking variants (spinlocks, + * rwlocks, mutexes and rwsems) to pass init/acquire/release events + * to lockdep: + */ + +extern void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key); + +/* + * Reinitialize a lock key - for cases where there is special locking or + * special initialization of locks so that the validator gets the scope + * of dependencies wrong: they are either too broad (they need a class-split) + * or they are too narrow (they suffer from a false class-split): + */ +#define lockdep_set_class(lock, key) \ + lockdep_init_map(&(lock)->dep_map, #key, key) +#define lockdep_set_class_and_name(lock, key, name) \ + lockdep_init_map(&(lock)->dep_map, name, key) + +/* + * Acquire a lock. + * + * Values for "read": + * + * 0: exclusive (write) acquire + * 1: read-acquire (no recursion allowed) + * 2: read-acquire with same-instance recursion allowed + * + * Values for check: + * + * 0: disabled + * 1: simple checks (freeing, held-at-exit-time, etc.)
+ * 2: full validation + */ +extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip); + +extern void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip); + +# define INIT_LOCKDEP .lockdep_recursion = 0, + +#else /* !LOCKDEP */ + +static inline void lockdep_off(void) +{ +} + +static inline void lockdep_on(void) +{ +} + +static inline int lockdep_internal(void) +{ + return 0; +} + +# define lock_acquire(l, s, t, r, c, i) do { } while (0) +# define lock_release(l, n, i) do { } while (0) +# define lockdep_init() do { } while (0) +# define lockdep_info() do { } while (0) +# define lockdep_init_map(lock, name, key) do { (void)(key); } while (0) +# define lockdep_set_class(lock, key) do { (void)(key); } while (0) +# define lockdep_set_class_and_name(lock, key, name) \ + do { (void)(key); } while (0) +# define INIT_LOCKDEP +# define lockdep_reset() do { debug_locks = 1; } while (0) +# define lockdep_free_key_range(start, size) do { } while (0) +/* + * The class key takes no space if lockdep is disabled: + */ +struct lock_class_key { }; +#endif /* !LOCKDEP */ + +#ifdef CONFIG_TRACE_IRQFLAGS +extern void early_boot_irqs_off(void); +extern void early_boot_irqs_on(void); +#else +# define early_boot_irqs_off() do { } while (0) +# define early_boot_irqs_on() do { } while (0) +#endif + +/* + * For trivial one-depth nesting of a lock-class, the following + * global define can be used. (Subsystems with multiple levels + * of nesting should define their own lock-nesting subclasses.) + */ +#define SINGLE_DEPTH_NESTING 1 + +/* + * Map the dependency ops to NOP or to real lockdep ops, depending + * on the per lock-class debug mode: + */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# else +# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# endif +# define spin_release(l, n, i) lock_release(l, n, i) +#else +# define spin_acquire(l, s, t, i) do { } while (0) +# define spin_release(l, n, i) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, i) +# else +# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, i) +# endif +# define rwlock_release(l, n, i) lock_release(l, n, i) +#else +# define rwlock_acquire(l, s, t, i) do { } while (0) +# define rwlock_acquire_read(l, s, t, i) do { } while (0) +# define rwlock_release(l, n, i) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# else +# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# endif +# define mutex_release(l, n, i) lock_release(l, n, i) +#else +# define mutex_acquire(l, s, t, i) do { } while (0) +# define mutex_release(l, n, i) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, i) +# else +# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, i) +# endif +# define rwsem_release(l, n, i) lock_release(l, n, i) +#else +# define rwsem_acquire(l, s, 
t, i) do { } while (0) +# define rwsem_acquire_read(l, s, t, i) do { } while (0) +# define rwsem_release(l, n, i) do { } while (0) +#endif + +#endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ad7a89014d29..8ebddba4448d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -886,6 +886,13 @@ struct task_struct { int hardirq_context; int softirq_context; #endif +#ifdef CONFIG_LOCKDEP +# define MAX_LOCK_DEPTH 30UL + u64 curr_chain_key; + int lockdep_depth; + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; +#endif /* journalling filesystem info */ void *journal_info; diff --git a/init/main.c b/init/main.c index fc73e1cd8614..fc473d4b56fd 100644 --- a/init/main.c +++ b/init/main.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -457,6 +458,15 @@ asmlinkage void __init start_kernel(void) smp_setup_processor_id(); + /* + * Need to run as early as possible, to initialize the + * lockdep hash: + */ + lockdep_init(); + + local_irq_disable(); + early_boot_irqs_off(); + /* * Interrupts are still disabled. Do necessary setups, then * enable them @@ -502,6 +512,7 @@ asmlinkage void __init start_kernel(void) profile_init(); if (!irqs_disabled()) printk("start_kernel(): bug: interrupts were enabled early\n"); + early_boot_irqs_on(); local_irq_enable(); /* @@ -512,6 +523,9 @@ asmlinkage void __init start_kernel(void) console_init(); if (panic_later) panic(panic_later, panic_param); + + lockdep_info(); + /* * Need to run this when irqs are enabled, because it wants * to self-test [hard/soft]-irqs on/off lock inversion bugs diff --git a/kernel/Makefile b/kernel/Makefile index e7fd20e70f1b..049aa79e5c1c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,6 +13,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o diff --git a/kernel/fork.c b/kernel/fork.c index b7db7fb74f53..7f48abdd7bb6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1061,6 +1061,11 @@ static task_t *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif +#ifdef CONFIG_LOCKDEP + p->lockdep_depth = 0; /* no locks held yet */ + p->curr_chain_key = 0; + p->lockdep_recursion = 0; +#endif rt_mutex_init_task(p); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c911c6ec4dd6..4e461438e48b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -410,6 +410,12 @@ int request_irq(unsigned int irq, struct irqaction *action; int retval; +#ifdef CONFIG_LOCKDEP + /* + * Lockdep wants atomic interrupt handlers: + */ + irqflags |= SA_INTERRUPT; +#endif /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out diff --git a/kernel/lockdep.c b/kernel/lockdep.c new file mode 100644 index 000000000000..dd0580910a97 --- /dev/null +++ b/kernel/lockdep.c @@ -0,0 +1,2703 @@ +/* + * kernel/lockdep.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * + * this code maps all the lock dependencies as they occur in a live kernel + * and will warn about the following classes of locking bugs: + * + * - lock inversion scenarios + * - circular lock dependencies + * - hardirq/softirq safe/unsafe locking 
bugs + * + * Bugs are reported even if the current locking scenario does not cause + * any deadlock at this point. + * + * I.e. if anytime in the past two locks were taken in a different order, + * even if it happened for another task, even if those were different + * locks (but of the same class as this lock), this code will detect it. + * + * Thanks to Arjan van de Ven for coming up with the initial idea of + * mapping lock dependencies runtime. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "lockdep_internals.h" + +/* + * hash_lock: protects the lockdep hashes and class/list/hash allocators. + * + * This is one of the rare exceptions where it's justified + * to use a raw spinlock - we really dont want the spinlock + * code to recurse back into the lockdep code. + */ +static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int lockdep_initialized; + +unsigned long nr_list_entries; +static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; + +/* + * Allocate a lockdep entry. (assumes hash_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + return list_entries + nr_list_entries++; +} + +/* + * All data structures here are protected by the global debug_lock. + * + * Mutex key structs only get allocated, once during bootup, and never + * get freed - this significantly simplifies the debugging code. + */ +unsigned long nr_lock_classes; +static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; + +/* + * We keep a global list of all lock classes. The list only grows, + * never shrinks. The list is only accessed with the lockdep + * spinlock lock held. + */ +LIST_HEAD(all_lock_classes); + +/* + * The lockdep classes are in a hash-table as well, for fast lookup: + */ +#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) +#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) +#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) +#define classhashentry(key) (classhash_table + __classhashfn((key))) + +static struct list_head classhash_table[CLASSHASH_SIZE]; + +unsigned long nr_lock_chains; +static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; + +/* + * We put the lock dependency chains into a hash-table as well, to cache + * their existence: + */ +#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) +#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) +#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) +#define __chainhashfn(chain) \ + (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) +#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) + +static struct list_head chainhash_table[CHAINHASH_SIZE]; + +/* + * The hash key of the lock dependency chains is a hash itself too: + * it's a hash of all locks taken up to that lock, including that lock. + * It's a 64-bit hash, because it's important for the keys to be + * unique. 
+ */ +#define iterate_chain_key(key1, key2) \ + (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ + (key2)) + +void lockdep_off(void) +{ + current->lockdep_recursion++; +} + +EXPORT_SYMBOL(lockdep_off); + +void lockdep_on(void) +{ + current->lockdep_recursion--; +} + +EXPORT_SYMBOL(lockdep_on); + +int lockdep_internal(void) +{ + return current->lockdep_recursion != 0; +} + +EXPORT_SYMBOL(lockdep_internal); + +/* + * Debugging switches: + */ + +#define VERBOSE 0 +#ifdef VERBOSE +# define VERY_VERBOSE 0 +#endif + +#if VERBOSE +# define HARDIRQ_VERBOSE 1 +# define SOFTIRQ_VERBOSE 1 +#else +# define HARDIRQ_VERBOSE 0 +# define SOFTIRQ_VERBOSE 0 +#endif + +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE +/* + * Quick filtering for interesting events: + */ +static int class_filter(struct lock_class *class) +{ + if (class->name_version == 1 && + !strcmp(class->name, "&rl->lock")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "&ni->mrec_lock")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "mft_ni_runlist_lock")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "mft_ni_mrec_lock")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "&vol->lcnbmp_lock")) + return 1; + return 0; +} +#endif + +static int verbose(struct lock_class *class) +{ +#if VERBOSE + return class_filter(class); +#endif + return 0; +} + +#ifdef CONFIG_TRACE_IRQFLAGS + +static int hardirq_verbose(struct lock_class *class) +{ +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +static int softirq_verbose(struct lock_class *class) +{ +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +#endif + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. + */ +unsigned long nr_stack_trace_entries; +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + +static int save_trace(struct stack_trace *trace) +{ + trace->nr_entries = 0; + trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->entries = stack_trace + nr_stack_trace_entries; + + save_stack_trace(trace, NULL, 0, 3); + + trace->max_entries = trace->nr_entries; + + nr_stack_trace_entries += trace->nr_entries; + if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) + return 0; + + if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { + __raw_spin_unlock(&hash_lock); + if (debug_locks_off()) { + printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + } + return 0; + } + + return 1; +} + +unsigned int nr_hardirq_chains; +unsigned int nr_softirq_chains; +unsigned int nr_process_chains; +unsigned int max_lockdep_depth; +unsigned int max_recursion_depth; + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * We cannot printk in early bootup code. Not even early_printk() + * might work. So we mark any initialization errors and printk + * about it later on, in lockdep_info(). 
+ */ +static int lockdep_init_error; + +/* + * Various lockdep statistics: + */ +atomic_t chain_lookup_hits; +atomic_t chain_lookup_misses; +atomic_t hardirqs_on_events; +atomic_t hardirqs_off_events; +atomic_t redundant_hardirqs_on; +atomic_t redundant_hardirqs_off; +atomic_t softirqs_on_events; +atomic_t softirqs_off_events; +atomic_t redundant_softirqs_on; +atomic_t redundant_softirqs_off; +atomic_t nr_unused_locks; +atomic_t nr_cyclic_checks; +atomic_t nr_cyclic_check_recursions; +atomic_t nr_find_usage_forwards_checks; +atomic_t nr_find_usage_forwards_recursions; +atomic_t nr_find_usage_backwards_checks; +atomic_t nr_find_usage_backwards_recursions; +# define debug_atomic_inc(ptr) atomic_inc(ptr) +# define debug_atomic_dec(ptr) atomic_dec(ptr) +# define debug_atomic_read(ptr) atomic_read(ptr) +#else +# define debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_dec(ptr) do { } while (0) +# define debug_atomic_read(ptr) 0 +#endif + +/* + * Locking printouts: + */ + +static const char *usage_str[] = +{ + [LOCK_USED] = "initial-use ", + [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", + [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", + [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", + [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", + [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", + [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", + [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", + [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", +}; + +const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +{ + unsigned long offs, size; + char *modname; + + return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); +} + +void +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) +{ + *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; + + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) + *c1 = '+'; + else + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) + *c1 = '-'; + + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) + *c2 = '+'; + else + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) + *c2 = '-'; + + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + *c3 = '-'; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { + *c3 = '+'; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + *c3 = '?'; + } + + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + *c4 = '-'; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { + *c4 = '+'; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + *c4 = '?'; + } +} + +static void print_lock_name(struct lock_class *class) +{ + char str[128], c1, c2, c3, c4; + const char *name; + + get_usage_chars(class, &c1, &c2, &c3, &c4); + + name = class->name; + if (!name) { + name = __get_key_name(class->key, str); + printk(" (%s", name); + } else { + printk(" (%s", name); + if (class->name_version > 1) + printk("#%d", class->name_version); + if (class->subclass) + printk("/%d", class->subclass); + } + printk("){%c%c%c%c}", c1, c2, c3, c4); +} + +static void print_lockdep_cache(struct lockdep_map *lock) +{ + const char *name; + char str[128]; + + name = lock->name; + if (!name) + name = __get_key_name(lock->key->subkeys, str); + + printk("%s", name); +} + +static void print_lock(struct held_lock *hlock) +{ + print_lock_name(hlock->class); + printk(", at: "); + print_ip_sym(hlock->acquire_ip); +} + +static void lockdep_print_held_locks(struct task_struct *curr) +{ + int i, depth = curr->lockdep_depth; + + if (!depth) { + printk("no locks held by %s/%d.\n", curr->comm, curr->pid); + return; + } + printk("%d lock%s held by %s/%d:\n", + depth, 
depth > 1 ? "s" : "", curr->comm, curr->pid); + + for (i = 0; i < depth; i++) { + printk(" #%d: ", i); + print_lock(curr->held_locks + i); + } +} +/* + * Helper to print a nice hierarchy of lock dependencies: + */ +static void print_spaces(int nr) +{ + int i; + + for (i = 0; i < nr; i++) + printk(" "); +} + +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; + + print_spaces(depth); + printk("->"); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); + + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + print_spaces(depth); + len += printk(" %s", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } + } + print_spaces(depth); + printk(" }\n"); + + print_spaces(depth); + printk(" ... key at: "); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk all lock dependencies starting at <entry>: + */ +static void print_lock_dependencies(struct lock_class *class, int depth) +{ + struct lock_list *entry; + + if (DEBUG_LOCKS_WARN_ON(depth >= 20)) + return; + + print_lock_class_header(class, depth); + + list_for_each_entry(entry, &class->locks_after, entry) { + DEBUG_LOCKS_WARN_ON(!entry->class); + print_lock_dependencies(entry->class, depth + 1); + + print_spaces(depth); + printk(" ... acquired at:\n"); + print_stack_trace(&entry->trace, 2); + printk("\n"); + } +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, + struct list_head *head, unsigned long ip) +{ + struct lock_list *entry; + /* + * Lock not present yet - get a new dependency struct and + * add it to the list: + */ + entry = alloc_list_entry(); + if (!entry) + return 0; + + entry->class = this; + save_trace(&entry->trace); + + /* + * Since we never remove from the dependency list, the list can + * be walked lockless by other CPUs, it's only allocation + * that must be protected by the spinlock. But this also means + * we must make new entries visible only once writes to the + * entry become visible - hence the RCU op: + */ + list_add_tail_rcu(&entry->entry, head); + + return 1; +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. + * + * (to keep the stackframe of the recursive functions small we + * use these global variables, and we also mark various helper + * functions as noinline.)
*/ +static struct held_lock *check_source, *check_target; + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, unsigned int depth) +{ + if (debug_locks_silent) + return 0; + printk("\n-> #%u", depth); + print_lock_name(target->class); + printk(":\n"); + print_stack_trace(&target->trace, 6); + + return 0; +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth) +{ + struct task_struct *curr = current; + + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n=======================================================\n"); + printk( "[ INFO: possible circular locking dependency detected ]\n"); + printk( "-------------------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, curr->pid); + print_lock(check_source); + printk("\nbut task is already holding lock:\n"); + print_lock(check_target); + printk("\nwhich lock already depends on the new lock.\n\n"); + printk("\nthe existing dependency chain (in reverse order) is:\n"); + + print_circular_bug_entry(entry, depth); + + return 0; +} + +static noinline int print_circular_bug_tail(void) +{ + struct task_struct *curr = current; + struct lock_list this; + + if (debug_locks_silent) + return 0; + + this.class = check_source->class; + save_trace(&this.trace); + print_circular_bug_entry(&this, 0); + + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int noinline print_infinite_recursion_bug(void) +{ + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + + return 0; +} + +/* + * Prove that the dependency graph starting at <source> can not + * lead to <target>. Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + + debug_atomic_inc(&nr_cyclic_check_recursions); + if (depth > max_recursion_depth) + max_recursion_depth = depth; + if (depth >= 20) + return print_infinite_recursion_bug(); + /* + * Check this lock's dependency list: + */ + list_for_each_entry(entry, &source->locks_after, entry) { + if (entry->class == check_target->class) + return print_circular_bug_header(entry, depth+1); + debug_atomic_inc(&nr_cyclic_checks); + if (!check_noncircular(entry->class, depth+1)) + return print_circular_bug_entry(entry, depth+1); + } + return 1; +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + */ +static enum lock_usage_bit find_usage_bit; +static struct lock_class *forwards_match, *backwards_match; + +/* + * Find a node in the forwards-direction dependency sub-graph starting + * at <source> that matches <bit>. + * + * Return 2 if such a node exists in the subgraph, and put that node + * into <forwards_match>. + * + * Return 1 otherwise and keep <forwards_match> unchanged. + * Return 0 on error.
+ */ +static noinline int +find_usage_forwards(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + int ret; + + if (depth > max_recursion_depth) + max_recursion_depth = depth; + if (depth >= 20) + return print_infinite_recursion_bug(); + + debug_atomic_inc(&nr_find_usage_forwards_checks); + if (source->usage_mask & (1 << find_usage_bit)) { + forwards_match = source; + return 2; + } + + /* + * Check this lock's dependency list: + */ + list_for_each_entry(entry, &source->locks_after, entry) { + debug_atomic_inc(&nr_find_usage_forwards_recursions); + ret = find_usage_forwards(entry->class, depth+1); + if (ret == 2 || ret == 0) + return ret; + } + return 1; +} + +/* + * Find a node in the backwards-direction dependency sub-graph starting + * at <source> that matches <bit>. + * + * Return 2 if such a node exists in the subgraph, and put that node + * into <backwards_match>. + * + * Return 1 otherwise and keep <backwards_match> unchanged. + * Return 0 on error. + */ +static noinline int +find_usage_backwards(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + int ret; + + if (depth > max_recursion_depth) + max_recursion_depth = depth; + if (depth >= 20) + return print_infinite_recursion_bug(); + + debug_atomic_inc(&nr_find_usage_backwards_checks); + if (source->usage_mask & (1 << find_usage_bit)) { + backwards_match = source; + return 2; + } + + /* + * Check this lock's dependency list: + */ + list_for_each_entry(entry, &source->locks_before, entry) { + debug_atomic_inc(&nr_find_usage_backwards_recursions); + ret = find_usage_backwards(entry->class, depth+1); + if (ret == 2 || ret == 0) + return ret; + } + return 1; +} + +static int +print_bad_irq_dependency(struct task_struct *curr, + struct held_lock *prev, + struct held_lock *next, + enum lock_usage_bit bit1, + enum lock_usage_bit bit2, + const char *irqclass) +{ + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n======================================================\n"); + printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + irqclass, irqclass); + printk( "------------------------------------------------------\n"); + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + curr->comm, curr->pid, + curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, + curr->hardirqs_enabled, + curr->softirqs_enabled); + print_lock(next); + + printk("\nand this task is already holding:\n"); + print_lock(prev); + printk("which would create a new lock dependency:\n"); + print_lock_name(prev->class); + printk(" ->"); + print_lock_name(next->class); + printk("\n"); + + printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + irqclass); + print_lock_name(backwards_match); + printk("\n... which became %s-irq-safe at:\n", irqclass); + + print_stack_trace(backwards_match->usage_traces + bit1, 1); + + printk("\nto a %s-irq-unsafe lock:\n", irqclass); + print_lock_name(forwards_match); + printk("\n...
which became %s-irq-unsafe at:\n", irqclass); + printk("..."); + + print_stack_trace(forwards_match->usage_traces + bit2, 1); + + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + + printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); + print_lock_dependencies(backwards_match, 0); + + printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); + print_lock_dependencies(forwards_match, 0); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int +check_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit_backwards, + enum lock_usage_bit bit_forwards, const char *irqclass) +{ + int ret; + + find_usage_bit = bit_backwards; + /* fills in <backwards_match> */ + ret = find_usage_backwards(prev->class, 0); + if (!ret || ret == 1) + return ret; + + find_usage_bit = bit_forwards; + ret = find_usage_forwards(next->class, 0); + if (!ret || ret == 1) + return ret; + /* ret == 2 */ + return print_bad_irq_dependency(curr, prev, next, + bit_backwards, bit_forwards, irqclass); +} + +#endif + +static int +print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + debug_locks_off(); + __raw_spin_unlock(&hash_lock); + if (debug_locks_silent) + return 0; + + printk("\n=============================================\n"); + printk( "[ INFO: possible recursive locking detected ]\n"); + printk( "---------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, curr->pid); + print_lock(next); + printk("\nbut task is already holding lock:\n"); + print_lock(prev); + + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Check whether we are holding such a class already. + * + * (Note that this has to be done separately, because the graph cannot + * detect such classes of deadlocks.) + * + * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + */ +static int +check_deadlock(struct task_struct *curr, struct held_lock *next, + struct lockdep_map *next_instance, int read) +{ + struct held_lock *prev; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + prev = curr->held_locks + i; + if (prev->class != next->class) + continue; + /* + * Allow read-after-read recursion of the same + * lock instance (i.e. read_lock(lock)+read_lock(lock)): + */ + if ((read == 2) && prev->read && + (prev->instance == next_instance)) + return 2; + return print_deadlock_bug(curr, prev, next); + } + return 1; +} + +/* + * There was a chain-cache miss, and we are about to add a new dependency + * to a previous lock. We recursively validate the following rules: + * + * - would the adding of the <prev> -> <next> dependency create a + * circular dependency in the graph? [== circular deadlock] + * + * - does the new prev->next dependency connect any hardirq-safe lock + * (in the full backwards-subgraph starting at <prev>) with any + * hardirq-unsafe lock (in the full forwards-subgraph starting at + * <next>)? [== illegal lock inversion with hardirq contexts] + * + * - does the new prev->next dependency connect any softirq-safe lock + * (in the full backwards-subgraph starting at <prev>) with any + * softirq-unsafe lock (in the full forwards-subgraph starting at + * <next>)? [== illegal lock inversion with softirq contexts] + * + * any of these scenarios could lead to a deadlock.
+ * + * Then if all the validations pass, we add the forwards and backwards + * dependency. + */ +static int +check_prev_add(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + struct lock_list *entry; + int ret; + + /* + * Prove that the new <prev> -> <next> dependency would not + * create a circular dependency in the graph. (We do this by + * forward-recursing into the graph starting at <next>, and + * checking whether we can reach <prev>.) + * + * We are using global variables to control the recursion, to + * keep the stackframe size of the recursive functions low: + */ + check_source = next; + check_target = prev; + if (!(check_noncircular(next->class, 0))) + return print_circular_bug_tail(); + +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, + LOCK_ENABLED_HARDIRQS, "hard-read")) + return 0; + + /* + * Prove that the new dependency does not connect a softirq-safe + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + /* + * Prove that the new dependency does not connect a softirq-safe-read + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; +#endif + /* + * For recursive read-locks we do all the dependency checks, + * but we dont store read-triggered dependencies (only + * write-triggered dependencies). This ensures that only the + * write-side dependencies matter, and that if for example a + * write-lock never takes any other locks, then the reads are + * equivalent to a NOP. + */ + if (next->read == 2 || prev->read == 2) + return 1; + /* + * Is the <prev> -> <next> dependency already present? + * + * (this may occur even though this is a new chain: consider + * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 + * chains - the second one will be new, but L1 already has + * L2 added to its dependency list, due to the first chain.) + */ + list_for_each_entry(entry, &prev->class->locks_after, entry) { + if (entry->class == next->class) + return 2; + } + + /* + * Ok, all validations passed, add the new lock + * to the previous lock's dependency list: + */ + ret = add_lock_to_list(prev->class, next->class, + &prev->class->locks_after, next->acquire_ip); + if (!ret) + return 0; + /* + * Return value of 2 signals 'dependency already added', + * in that case we dont have to add the backlink either.
+ */ + if (ret == 2) + return 2; + ret = add_lock_to_list(next->class, prev->class, + &next->class->locks_before, next->acquire_ip); + + /* + * Debugging printouts: + */ + if (verbose(prev->class) || verbose(next->class)) { + __raw_spin_unlock(&hash_lock); + printk("\n new dependency: "); + print_lock_name(prev->class); + printk(" => "); + print_lock_name(next->class); + printk("\n"); + dump_stack(); + __raw_spin_lock(&hash_lock); + } + return 1; +} + +/* + * Add the dependency to all directly-previous locks that are 'relevant'. + * The ones that are relevant are (in increasing distance from curr): + * all consecutive trylock entries and the final non-trylock entry - or + * the end of this context's lock-chain - whichever comes first. + */ +static int +check_prevs_add(struct task_struct *curr, struct held_lock *next) +{ + int depth = curr->lockdep_depth; + struct held_lock *hlock; + + /* + * Debugging checks. + * + * Depth must not be zero for a non-head lock: + */ + if (!depth) + goto out_bug; + /* + * At least two relevant locks must exist for this + * to be a head: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + goto out_bug; + + for (;;) { + hlock = curr->held_locks + depth-1; + /* + * Only non-recursive-read entries get new dependencies + * added: + */ + if (hlock->read != 2) { + check_prev_add(curr, hlock, next); + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } + depth--; + /* + * End of lock-stack? + */ + if (!depth) + break; + /* + * Stop the search if we cross into another context: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + break; + } + return 1; +out_bug: + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + + return 0; +} + + +/* + * Is this the address of a static object: + */ +static int static_obj(void *obj) +{ + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; +#ifdef CONFIG_SMP + int i; +#endif + + /* + * static variable? + */ + if ((addr >= start) && (addr < end)) + return 1; + +#ifdef CONFIG_SMP + /* + * percpu var? + */ + for_each_possible_cpu(i) { + start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); + + if ((addr >= start) && (addr < end)) + return 1; + } +#endif + + /* + * module var? + */ + return is_module_address(addr); +} + +/* + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: + */ +static int count_matching_names(struct lock_class *new_class) +{ + struct lock_class *class; + int count = 0; + + if (!new_class->name) + return 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } + + return count + 1; +} + +extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. 
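Illustration (not part of the patch): the class-key scheme that register_lock_class() below hashes on is per initialization site, so every instance initialized at one site collapses into a single class. A hypothetical driver sketch, relying on the spin_lock_init() expansion introduced by the spinlock patch later in this series:

	struct my_dev {
		spinlock_t lock;	/* hypothetical structure */
	};

	static void my_dev_init(struct my_dev *d)
	{
		/*
		 * spin_lock_init() expands to a static struct
		 * lock_class_key local to this call site, so all my_dev
		 * locks share one lock_class -- rules learned on any one
		 * instance protect every instance.
		 */
		spin_lock_init(&d->lock);
	}
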
+ */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. (we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + } +#endif + + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; + + /* + * NOTE: the class-key must be unique. For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) + __error_too_big_MAX_LOCKDEP_SUBCLASSES(); + + key = lock->key->subkeys + subclass; + + hash_head = classhashentry(key); + + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_set; + + /* + * Debug-check: all keys must be persistent! + */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + __raw_spin_lock(&hash_lock); + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(&nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + + if (verbose(class)) { + __raw_spin_unlock(&hash_lock); + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + __raw_spin_lock(&hash_lock); + } +out_unlock_set: + __raw_spin_unlock(&hash_lock); + +out_set: + lock->class[subclass] = class; + + DEBUG_LOCKS_WARN_ON(class->subclass != subclass); + + return class; +} + +/* + * Look up a dependency chain. If the key is not present yet then + * add it and return 0 - in this case the new dependency chain is + * validated. If the key is already hashed, return 1. 
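Illustration (not part of the patch): the chain key that lookup_chain_cache() below indexes on is built incrementally from class IDs (id = class - lock_classes). A standalone sketch of a rotate-and-xor fold in the spirit of iterate_chain_key(); the 11-bit width matches MAX_LOCKDEP_KEYS_BITS from lockdep_internals.h, while the exact mixing function, defined earlier in this file, is assumed here:

	#include <stdint.h>

	#define KEY_BITS 11	/* MAX_LOCKDEP_KEYS_BITS */

	/* rotate the running key, then mix in the next class ID */
	static uint64_t fold_key(uint64_t chain_key, uint64_t class_id)
	{
		return (chain_key << KEY_BITS) ^
		       (chain_key >> (64 - KEY_BITS)) ^ class_id;
	}
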
+ */ +static inline int lookup_chain_cache(u64 chain_key) +{ + struct list_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + list_for_each_entry(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { +cache_hit: + debug_atomic_inc(&chain_lookup_hits); + /* + * In the debugging case, force redundant checking + * by returning 1: + */ +#ifdef CONFIG_DEBUG_LOCKDEP + __raw_spin_lock(&hash_lock); + return 1; +#endif + return 0; + } + } + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ + __raw_spin_lock(&hash_lock); + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + list_for_each_entry(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + __raw_spin_unlock(&hash_lock); + goto cache_hit; + } + } + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return 0; + } + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + list_add_tail_rcu(&chain->entry, hash_head); + debug_atomic_inc(&chain_lookup_misses); +#ifdef CONFIG_TRACE_IRQFLAGS + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +#else + nr_process_chains++; +#endif + + return 1; +} + +/* + * We are building curr_chain_key incrementally, so double-check + * it from scratch, to make sure that it's done correctly: + */ +static void check_chain_key(struct task_struct *curr) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + struct held_lock *hlock, *prev_hlock = NULL; + unsigned int i, id; + u64 chain_key = 0; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + if (chain_key != hlock->prev_chain_key) { + debug_locks_off(); + printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", + curr->lockdep_depth, i, + (unsigned long long)chain_key, + (unsigned long long)hlock->prev_chain_key); + WARN_ON(1); + return; + } + id = hlock->class - lock_classes; + DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); + if (prev_hlock && (prev_hlock->irq_context != + hlock->irq_context)) + chain_key = 0; + chain_key = iterate_chain_key(chain_key, id); + prev_hlock = hlock; + } + if (chain_key != curr->curr_chain_key) { + debug_locks_off(); + printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", + curr->lockdep_depth, i, + (unsigned long long)chain_key, + (unsigned long long)curr->curr_chain_key); + WARN_ON(1); + } +#endif +} + +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * print irq inversion bug: + */ +static int +print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, + struct held_lock *this, int forwards, + const char *irqclass) +{ + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n=========================================================\n"); + printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + printk( "---------------------------------------------------------\n"); + printk("%s/%d just changed the state of lock:\n", + curr->comm, curr->pid); + print_lock(this); + if (forwards) + printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); + else + printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", 
irqclass);
+	print_lock_name(other);
+	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nthe first lock's dependencies:\n");
+	print_lock_dependencies(this->class, 0);
+
+	printk("\nthe second lock's dependencies:\n");
+	print_lock_dependencies(other, 0);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+/*
+ * Prove that in the forwards-direction subgraph starting at <this>
+ * there is no lock matching <bit>:
+ */
+static int
+check_usage_forwards(struct task_struct *curr, struct held_lock *this,
+		     enum lock_usage_bit bit, const char *irqclass)
+{
+	int ret;
+
+	find_usage_bit = bit;
+	/* fills in <forwards_match> */
+	ret = find_usage_forwards(this->class, 0);
+	if (!ret || ret == 1)
+		return ret;
+
+	return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+}
+
+/*
+ * Prove that in the backwards-direction subgraph starting at <this>
+ * there is no lock matching <bit>:
+ */
+static int
+check_usage_backwards(struct task_struct *curr, struct held_lock *this,
+		      enum lock_usage_bit bit, const char *irqclass)
+{
+	int ret;
+
+	find_usage_bit = bit;
+	/* fills in <backwards_match> */
+	ret = find_usage_backwards(this->class, 0);
+	if (!ret || ret == 1)
+		return ret;
+
+	return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+}
+
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+	printk("irq event stamp: %u\n", curr->irq_events);
+	printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
+	print_ip_sym(curr->hardirq_enable_ip);
+	printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
+	print_ip_sym(curr->hardirq_disable_ip);
+	printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
+	print_ip_sym(curr->softirq_enable_ip);
+	printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
+	print_ip_sym(curr->softirq_disable_ip);
+}
+
+#else
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+	__raw_spin_unlock(&hash_lock);
+	debug_locks_off();
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n=================================\n");
+	printk( "[ INFO: inconsistent lock state ]\n");
+	printk( "---------------------------------\n");
+
+	printk("inconsistent {%s} -> {%s} usage.\n",
+		usage_str[prev_bit], usage_str[new_bit]);
+
+	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+		curr->comm, curr->pid,
+		trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+		trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+		trace_hardirqs_enabled(curr),
+		trace_softirqs_enabled(curr));
+	print_lock(this);
+
+	printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+	print_stack_trace(this->class->usage_traces + prev_bit, 1);
+
+	print_irqtrace_events(curr);
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set:
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+	    enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+	if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+		return print_usage_bug(curr, this, bad_bit, new_bit);
+	return 1;
+}
+
+#define STRICT_READ_CHECKS	1
+
+/*
+ *
Mark a lock with a usage bit, and validate the state transition: + */ +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, unsigned long ip) +{ + unsigned int new_mask = 1 << new_bit, ret = 1; + + /* + * If already set then do not dirty the cacheline, + * nor do any checks: + */ + if (likely(this->class->usage_mask & new_mask)) + return 1; + + __raw_spin_lock(&hash_lock); + /* + * Make sure we didnt race: + */ + if (unlikely(this->class->usage_mask & new_mask)) { + __raw_spin_unlock(&hash_lock); + return 1; + } + + this->class->usage_mask |= new_mask; + +#ifdef CONFIG_TRACE_IRQFLAGS + if (new_bit == LOCK_ENABLED_HARDIRQS || + new_bit == LOCK_ENABLED_HARDIRQS_READ) + ip = curr->hardirq_enable_ip; + else if (new_bit == LOCK_ENABLED_SOFTIRQS || + new_bit == LOCK_ENABLED_SOFTIRQS_READ) + ip = curr->softirq_enable_ip; +#endif + if (!save_trace(this->class->usage_traces + new_bit)) + return 0; + + switch (new_bit) { +#ifdef CONFIG_TRACE_IRQFLAGS + case LOCK_USED_IN_HARDIRQ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_ENABLED_HARDIRQS_READ)) + return 0; + /* + * just marked it hardirq-safe, check that this lock + * took no hardirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it hardirq-safe, check that this lock + * took no hardirq-unsafe-read lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_HARDIRQS_READ, "hard-read")) + return 0; +#endif + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_USED_IN_SOFTIRQ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_ENABLED_SOFTIRQS_READ)) + return 0; + /* + * just marked it softirq-safe, check that this lock + * took no softirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-safe, check that this lock + * took no softirq-unsafe-read lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) + return 0; +#endif + if (softirq_verbose(this->class)) + ret = 2; + break; + case LOCK_USED_IN_HARDIRQ_READ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + return 0; + /* + * just marked it hardirq-read-safe, check that this lock + * took no hardirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_USED_IN_SOFTIRQ_READ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) + return 0; + /* + * just marked it softirq-read-safe, check that this lock + * took no softirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + if (softirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_HARDIRQS: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_USED_IN_HARDIRQ_READ)) + return 0; + /* + * just marked it hardirq-unsafe, check that no hardirq-safe + * lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_HARDIRQ, "hard")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it 
hardirq-unsafe, check that no + * hardirq-safe-read lock in the system ever took + * it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_HARDIRQ_READ, "hard-read")) + return 0; +#endif + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_SOFTIRQS: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_USED_IN_SOFTIRQ_READ)) + return 0; + /* + * just marked it softirq-unsafe, check that no softirq-safe + * lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_SOFTIRQ, "soft")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-unsafe, check that no + * softirq-safe-read lock in the system ever took + * it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) + return 0; +#endif + if (softirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_HARDIRQS_READ: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it hardirq-read-unsafe, check that no + * hardirq-safe lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_HARDIRQ, "hard")) + return 0; +#endif + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_SOFTIRQS_READ: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-read-unsafe, check that no + * softirq-safe lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_SOFTIRQ, "soft")) + return 0; +#endif + if (softirq_verbose(this->class)) + ret = 2; + break; +#endif + case LOCK_USED: + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); + debug_atomic_dec(&nr_unused_locks); + break; + default: + debug_locks_off(); + WARN_ON(1); + return 0; + } + + __raw_spin_unlock(&hash_lock); + + /* + * We must printk outside of the hash_lock: + */ + if (ret == 2) { + printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); + print_lock(this); + print_irqtrace_events(curr); + dump_stack(); + } + + return ret; +} + +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * Mark all held locks with a usage bit: + */ +static int +mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) +{ + enum lock_usage_bit usage_bit; + struct held_lock *hlock; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + + if (hardirq) { + if (hlock->read) + usage_bit = LOCK_ENABLED_HARDIRQS_READ; + else + usage_bit = LOCK_ENABLED_HARDIRQS; + } else { + if (hlock->read) + usage_bit = LOCK_ENABLED_SOFTIRQS_READ; + else + usage_bit = LOCK_ENABLED_SOFTIRQS; + } + if (!mark_lock(curr, hlock, usage_bit, ip)) + return 0; + } + + return 1; +} + +/* + * Debugging helper: via this flag we know that we are in + * 'early bootup code', and will warn about any invalid irqs-on event: + */ +static int early_boot_irqs_enabled; + +void early_boot_irqs_off(void) +{ + early_boot_irqs_enabled = 0; +} + +void early_boot_irqs_on(void) +{ + early_boot_irqs_enabled = 1; +} + +/* + * Hardirqs will be enabled: + */ +void trace_hardirqs_on(void) +{ + struct task_struct *curr = current; + unsigned long ip; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + return; + + if 
(unlikely(curr->hardirqs_enabled)) { + debug_atomic_inc(&redundant_hardirqs_on); + return; + } + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + ip = (unsigned long) __builtin_return_address(0); + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + return; + /* + * We are going to turn hardirqs on, so set the + * usage bit for all held locks: + */ + if (!mark_held_locks(curr, 1, ip)) + return; + /* + * If we have softirqs enabled, then set the usage + * bit for all held locks. (disabled hardirqs prevented + * this bit from being set before) + */ + if (curr->softirqs_enabled) + if (!mark_held_locks(curr, 0, ip)) + return; + + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_on_events); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +/* + * Hardirqs were disabled: + */ +void trace_hardirqs_off(void) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->hardirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = _RET_IP_; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_off_events); + } else + debug_atomic_inc(&redundant_hardirqs_off); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +/* + * Softirqs will be enabled: + */ +void trace_softirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + debug_atomic_inc(&redundant_softirqs_on); + return; + } + + /* + * We'll do an OFF -> ON transition: + */ + curr->softirqs_enabled = 1; + curr->softirq_enable_ip = ip; + curr->softirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_on_events); + /* + * We are going to turn softirqs on, so set the + * usage bit for all held locks, if hardirqs are + * enabled too: + */ + if (curr->hardirqs_enabled) + mark_held_locks(curr, 0, ip); +} + +/* + * Softirqs were disabled: + */ +void trace_softirqs_off(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->softirqs_enabled = 0; + curr->softirq_disable_ip = ip; + curr->softirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_off_events); + DEBUG_LOCKS_WARN_ON(!softirq_count()); + } else + debug_atomic_inc(&redundant_softirqs_off); +} + +#endif + +/* + * Initialize a lock instance's lock-class mapping info: + */ +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key) +{ + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!key)) + return; + if (DEBUG_LOCKS_WARN_ON(!name)) + return; + /* + * Sanity check, the lock-class key must be persistent: + */ + if (!static_obj(key)) { + printk("BUG: key %p not in .data!\n", key); + DEBUG_LOCKS_WARN_ON(1); + return; + } + lock->name = name; + lock->key = key; + memset(lock->class, 0, sizeof(lock->class[0])*MAX_LOCKDEP_SUBCLASSES); +} + +EXPORT_SYMBOL_GPL(lockdep_init_map); + +/* + * This gets called for every mutex_lock*()/spin_lock*() operation. 
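+ *
+ * (Editorial illustration, not part of the original patch -- the typical
+ *  entry path, schematically, once the spinlock patch later in this
+ *  series wires it up; check=2 requests full validation:
+ *
+ *	spin_lock(&lock)
+ *	  -> lock_acquire(&lock.dep_map, 0, 0, 0, 2, _RET_IP_)
+ *	     -> __lock_acquire(..., irqs_disabled_flags(flags), ip)
+ * )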
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+			  int trylock, int read, int check, int hardirqs_off,
+			  unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock;
+	struct lock_class *class;
+	unsigned int depth, id;
+	int chain_head = 0;
+	u64 chain_key;
+
+	if (unlikely(!debug_locks))
+		return 0;
+
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
+	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+		debug_locks_off();
+		printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
+		printk("turning off the locking correctness validator.\n");
+		return 0;
+	}
+
+	class = lock->class[subclass];
+	/* not cached yet? */
+	if (unlikely(!class)) {
+		class = register_lock_class(lock, subclass);
+		if (!class)
+			return 0;
+	}
+	debug_atomic_inc((atomic_t *)&class->ops);
+	if (very_verbose(class)) {
+		printk("\nacquire class [%p] %s", class->key, class->name);
+		if (class->name_version > 1)
+			printk("#%d", class->name_version);
+		printk("\n");
+		dump_stack();
+	}
+
+	/*
+	 * Add the lock to the list of currently held locks.
+	 * (we don't increase the depth just yet, up until the
+	 * dependency checks are done)
+	 */
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+		return 0;
+
+	hlock = curr->held_locks + depth;
+
+	hlock->class = class;
+	hlock->acquire_ip = ip;
+	hlock->instance = lock;
+	hlock->trylock = trylock;
+	hlock->read = read;
+	hlock->check = check;
+	hlock->hardirqs_off = hardirqs_off;
+
+	if (check != 2)
+		goto out_calc_hash;
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/*
+	 * If non-trylock use in a hardirq or softirq context, then
+	 * mark the lock as used in these contexts:
+	 */
+	if (!trylock) {
+		if (read) {
+			if (curr->hardirq_context)
+				if (!mark_lock(curr, hlock,
+						LOCK_USED_IN_HARDIRQ_READ, ip))
+					return 0;
+			if (curr->softirq_context)
+				if (!mark_lock(curr, hlock,
+						LOCK_USED_IN_SOFTIRQ_READ, ip))
+					return 0;
+		} else {
+			if (curr->hardirq_context)
+				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
+					return 0;
+			if (curr->softirq_context)
+				if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
+					return 0;
+		}
+	}
+	if (!hardirqs_off) {
+		if (read) {
+			if (!mark_lock(curr, hlock,
+					LOCK_ENABLED_HARDIRQS_READ, ip))
+				return 0;
+			if (curr->softirqs_enabled)
+				if (!mark_lock(curr, hlock,
+						LOCK_ENABLED_SOFTIRQS_READ, ip))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock,
+					LOCK_ENABLED_HARDIRQS, ip))
+				return 0;
+			if (curr->softirqs_enabled)
+				if (!mark_lock(curr, hlock,
+						LOCK_ENABLED_SOFTIRQS, ip))
+					return 0;
+		}
+	}
+#endif
+	/* mark it as used: */
+	if (!mark_lock(curr, hlock, LOCK_USED, ip))
+		return 0;
+out_calc_hash:
+	/*
+	 * Calculate the chain hash: it's the combined hash of all the
+	 * lock keys along the dependency chain. We save the hash value
+	 * at every step so that we can get the current hash easily
+	 * after unlock. The chain hash is then used to cache dependency
+	 * results.
+	 *
+	 * The 'key ID' (the index of the class) is the most compact
+	 * value to drive the hash with, not class->key itself.
+	 */
+	id = class - lock_classes;
+	if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+		return 0;
+
+	chain_key = curr->curr_chain_key;
+	if (!depth) {
+		if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+			return 0;
+		chain_head = 1;
+	}
+
+	hlock->prev_chain_key = chain_key;
+
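+	/*
+	 * (Editorial illustration, not part of the original patch: with
+	 *  locks A, B and C acquired in order, the running key evolves as
+	 *
+	 *	key0 = 0
+	 *	key1 = iterate_chain_key(key0, id(A))
+	 *	key2 = iterate_chain_key(key1, id(B))
+	 *	key3 = iterate_chain_key(key2, id(C))
+	 *
+	 *  and key3 indexes the chain cache consulted below.)
+	 */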
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/*
+	 * Keep track of points where we cross into an interrupt context:
+	 */
+	hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
+		curr->softirq_context;
+	if (depth) {
+		struct held_lock *prev_hlock;
+
+		prev_hlock = curr->held_locks + depth-1;
+		/*
+		 * If we cross into another context, reset the
+		 * hash key (this also prevents the checking and the
+		 * adding of the dependency to 'prev'):
+		 */
+		if (prev_hlock->irq_context != hlock->irq_context) {
+			chain_key = 0;
+			chain_head = 1;
+		}
+	}
+#endif
+	chain_key = iterate_chain_key(chain_key, id);
+	curr->curr_chain_key = chain_key;
+
+	/*
+	 * Trylock needs to maintain the stack of held locks, but it
+	 * does not add new dependencies, because trylock can be done
+	 * in any order.
+	 *
+	 * We look up the chain_key and do the O(N^2) check and update of
+	 * the dependencies only if this is a new dependency chain.
+	 * (If lookup_chain_cache() returns with 1 it acquires
+	 * hash_lock for us)
+	 */
+	if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
+		/*
+		 * Check whether last held lock:
+		 *
+		 * - is irq-safe, if this lock is irq-unsafe
+		 * - is softirq-safe, if this lock is hardirq-unsafe
+		 *
+		 * And check whether the new lock's dependency graph
+		 * could lead back to the previous lock.
+		 *
+		 * Any of these scenarios could lead to a deadlock.
+		 */
+		int ret = check_deadlock(curr, hlock, lock, read);
+
+		if (!ret)
+			return 0;
+		/*
+		 * Mark recursive read, as we jump over it when
+		 * building dependencies (just like we jump over
+		 * trylock entries):
+		 */
+		if (ret == 2)
+			hlock->read = 2;
+		/*
+		 * Add dependency only if this lock is not the head
+		 * of the chain, and if it's not a secondary read-lock:
+		 */
+		if (!chain_head && ret != 2)
+			if (!check_prevs_add(curr, hlock))
+				return 0;
+		__raw_spin_unlock(&hash_lock);
+	}
+	curr->lockdep_depth++;
+	check_chain_key(curr);
+	if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+		debug_locks_off();
+		printk("BUG: MAX_LOCK_DEPTH too low!\n");
+		printk("turning off the locking correctness validator.\n");
+		return 0;
+	}
+	if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+		max_lockdep_depth = curr->lockdep_depth;
+
+	return 1;
+}
+
+static int
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+			   unsigned long ip)
+{
+	if (!debug_locks_off())
+		return 0;
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n=====================================\n");
+	printk( "[ BUG: bad unlock balance detected! ]\n");
+	printk( "-------------------------------------\n");
+	printk("%s/%d is trying to release lock (",
+		curr->comm, curr->pid);
+	print_lockdep_cache(lock);
+	printk(") at:\n");
+	print_ip_sym(ip);
+	printk("but there are no more locks to release!\n");
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+			unsigned long ip)
+{
+	if (unlikely(!debug_locks))
+		return 0;
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
+	if (curr->lockdep_depth <= 0)
+		return print_unlock_imbalance_bug(curr, lock, ip);
+
+	return 1;
+}
+
+/*
+ * Remove a lock from the list of currently held locks in a
+ * potentially non-nested (out of order) manner. This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release()):
+ */
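Illustration (not part of the patch): the out-of-order pattern this path exists for, with invented lock names. Releasing 'a' while 'b' is above it on the held-lock stack takes the non-nested path, which removes 'a' and re-acquires the entries above it:

	static DEFINE_SPINLOCK(lock_a);	/* hypothetical locks */
	static DEFINE_SPINLOCK(lock_b);

	void out_of_order(void)
	{
		spin_lock(&lock_a);
		spin_lock(&lock_b);
		spin_unlock(&lock_a);	/* not the top of the stack */
		spin_unlock(&lock_b);
	}
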
+static int
+lock_release_non_nested(struct task_struct *curr,
+			struct lockdep_map *lock, unsigned long ip)
+{
+	struct held_lock *hlock, *prev_hlock;
+	unsigned int depth;
+	int i;
+
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_imbalance_bug(curr, lock, ip);
+
+found_it:
+	/*
+	 * We have the right lock to unlock, 'hlock' points to it.
+	 * Now we remove it from the stack, and add back the other
+	 * entries (if any), recalculating the hash along the way:
+	 */
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (i++; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock->class->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->acquire_ip))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+		return 0;
+	return 1;
+}
+
+/*
+ * Remove a lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static int lock_release_nested(struct task_struct *curr,
+			       struct lockdep_map *lock, unsigned long ip)
+{
+	struct held_lock *hlock;
+	unsigned int depth;
+
+	/*
+	 * Pop off the top of the lock stack:
+	 */
+	depth = curr->lockdep_depth - 1;
+	hlock = curr->held_locks + depth;
+
+	/*
+	 * Is the unlock non-nested:
+	 */
+	if (hlock->instance != lock)
+		return lock_release_non_nested(curr, lock, ip);
+	curr->lockdep_depth--;
+
+	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+		return 0;
+
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	hlock->prev_chain_key = 0;
+	hlock->class = NULL;
+	hlock->acquire_ip = 0;
+	hlock->irq_context = 0;
+#endif
+	return 1;
+}
+
+/*
+ * Remove a lock from the list of currently held locks - called on
+ * mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()); dispatches to the nested or
+ * non-nested variant above:
+ */
+static void
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	if (!check_unlock(curr, lock, ip))
+		return;
+
+	if (nested) {
+		if (!lock_release_nested(curr, lock, ip))
+			return;
+	} else {
+		if (!lock_release_non_nested(curr, lock, ip))
+			return;
+	}
+
+	check_chain_key(curr);
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
+	if (!debug_locks)
+		return;
+
+	if (irqs_disabled_flags(flags))
+		DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled);
+	else
+		DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled);
+
+	/*
+	 * We don't accurately track softirq state in e.g.
+ * hardirq contexts (such as on 4KSTACKS), so only + * check if not in hardirq contexts: + */ + if (!hardirq_count()) { + if (softirq_count()) + DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); + else + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + + if (!debug_locks) + print_irqtrace_events(current); +#endif +} + +/* + * We are not always called with irqs disabled - do that here, + * and also avoid lockdep recursion: + */ +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_acquire(lock, subclass, trylock, read, check, + irqs_disabled_flags(flags), ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_acquire); + +void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_release(lock, nested, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_release); + +/* + * Used by the testsuite, sanitize the validator state + * after a simulated failure: + */ + +void lockdep_reset(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + current->curr_chain_key = 0; + current->lockdep_depth = 0; + current->lockdep_recursion = 0; + memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); + nr_hardirq_chains = 0; + nr_softirq_chains = 0; + nr_process_chains = 0; + debug_locks = 1; + raw_local_irq_restore(flags); +} + +static void zap_class(struct lock_class *class) +{ + int i; + + /* + * Remove all dependencies this lock is + * involved in: + */ + for (i = 0; i < nr_list_entries; i++) { + if (list_entries[i].class == class) + list_del_rcu(&list_entries[i].entry); + } + /* + * Unhash the class and remove it from the all_lock_classes list: + */ + list_del_rcu(&class->hash_entry); + list_del_rcu(&class->lock_entry); + +} + +static inline int within(void *addr, void *start, unsigned long size) +{ + return addr >= start && addr < start + size; +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + struct lock_class *class, *next; + struct list_head *head; + unsigned long flags; + int i; + + raw_local_irq_save(flags); + __raw_spin_lock(&hash_lock); + + /* + * Unhash all classes that were created by this module: + */ + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + if (list_empty(head)) + continue; + list_for_each_entry_safe(class, next, head, hash_entry) + if (within(class->key, start, size)) + zap_class(class); + } + + __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + struct lock_class *class, *next, *entry; + struct list_head *head; + unsigned long flags; + int i, j; + + raw_local_irq_save(flags); + __raw_spin_lock(&hash_lock); + + /* + * Remove all classes this lock has: + */ + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + if (list_empty(head)) + continue; + list_for_each_entry_safe(class, next, head, hash_entry) { + for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { + entry = lock->class[j]; + if (class == entry) { + zap_class(class); + lock->class[j] = NULL; + break; + } + } + } + } + + /* + * Debug check: in 
the end all mapped classes should + * be gone. + */ + for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { + entry = lock->class[j]; + if (!entry) + continue; + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + raw_local_irq_restore(flags); + return; + } + + __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); +} + +void __init lockdep_init(void) +{ + int i; + + /* + * Some architectures have their own start_kernel() + * code which calls lockdep_init(), while we also + * call lockdep_init() from the start_kernel() itself, + * and we want to initialize the hashes only once: + */ + if (lockdep_initialized) + return; + + for (i = 0; i < CLASSHASH_SIZE; i++) + INIT_LIST_HEAD(classhash_table + i); + + for (i = 0; i < CHAINHASH_SIZE; i++) + INIT_LIST_HEAD(chainhash_table + i); + + lockdep_initialized = 1; +} + +void __init lockdep_info(void) +{ + printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); + printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + + printk(" memory used by lock dependency info: %lu kB\n", + (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + + sizeof(struct list_head) * CLASSHASH_SIZE + + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); + + printk(" per task-struct memory footprint: %lu bytes\n", + sizeof(struct held_lock) * MAX_LOCK_DEPTH); + +#ifdef CONFIG_DEBUG_LOCKDEP + if (lockdep_init_error) + printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); +#endif +} + +static inline int in_range(const void *start, const void *addr, const void *end) +{ + return addr >= start && addr <= end; +} + +static void +print_freed_lock_bug(struct task_struct *curr, const void *mem_from, + const void *mem_to) +{ + if (!debug_locks_off()) + return; + if (debug_locks_silent) + return; + + printk("\n=========================\n"); + printk( "[ BUG: held lock freed! 
]\n");
+	printk( "-------------------------\n");
+	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+		curr->comm, curr->pid, mem_from, mem_to-1);
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <mem_from> to <mem_to>:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+	const void *mem_to = mem_from + mem_len, *lock_from, *lock_to;
+	struct task_struct *curr = current;
+	struct held_lock *hlock;
+	unsigned long flags;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	local_irq_save(flags);
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		hlock = curr->held_locks + i;
+
+		lock_from = (void *)hlock->instance;
+		lock_to = (void *)(hlock->instance + 1);
+
+		if (!in_range(mem_from, lock_from, mem_to) &&
+		    !in_range(mem_from, lock_to, mem_to))
+			continue;
+
+		print_freed_lock_bug(curr, mem_from, mem_to);
+		break;
+	}
+	local_irq_restore(flags);
+}
+
+static void print_held_locks_bug(struct task_struct *curr)
+{
+	if (!debug_locks_off())
+		return;
+	if (debug_locks_silent)
+		return;
+
+	printk("\n=====================================\n");
+	printk( "[ BUG: lock held at task exit time! ]\n");
+	printk( "-------------------------------------\n");
+	printk("%s/%d is exiting with locks still held!\n",
+		curr->comm, curr->pid);
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+}
+
+void debug_check_no_locks_held(struct task_struct *task)
+{
+	if (unlikely(task->lockdep_depth > 0))
+		print_held_locks_bug(task);
+}
+
+void debug_show_all_locks(void)
+{
+	struct task_struct *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	printk("\nShowing all locks held in the system:\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		if (p->lockdep_depth)
+			lockdep_print_held_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
+	lockdep_print_held_locks(task);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 000000000000..0d355f24fe04
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,78 @@
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */
+
+/*
+ * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
+ * we track.
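+ *
+ * (Editorial note, not part of the original patch: each limit in this
+ *  header sizes a static array -- e.g. MAX_LOCKDEP_KEYS = 1 << 11 = 2048
+ *  lock classes and MAX_LOCKDEP_CHAINS = 1 << 13 = 8192 cached chains --
+ *  so exhausting one prints a "too low!" message and disables the
+ *  validator rather than allocating more.)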
+ * + * We use the per-lock dependency maps in two ways: we grow it by adding + * every to-be-taken lock to all currently held lock's own dependency + * table (if it's not there yet), and we check it for lock order + * conflicts and deadlocks. + */ +#define MAX_LOCKDEP_ENTRIES 8192UL + +#define MAX_LOCKDEP_KEYS_BITS 11 +#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) + +#define MAX_LOCKDEP_CHAINS_BITS 13 +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. + */ +#define MAX_STACK_TRACE_ENTRIES 131072UL + +extern struct list_head all_lock_classes; + +extern void +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); + +extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + +extern unsigned long nr_lock_classes; +extern unsigned long nr_list_entries; +extern unsigned long nr_lock_chains; +extern unsigned long nr_stack_trace_entries; + +extern unsigned int nr_hardirq_chains; +extern unsigned int nr_softirq_chains; +extern unsigned int nr_process_chains; +extern unsigned int max_lockdep_depth; +extern unsigned int max_recursion_depth; + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * Various lockdep statistics: + */ +extern atomic_t chain_lookup_hits; +extern atomic_t chain_lookup_misses; +extern atomic_t hardirqs_on_events; +extern atomic_t hardirqs_off_events; +extern atomic_t redundant_hardirqs_on; +extern atomic_t redundant_hardirqs_off; +extern atomic_t softirqs_on_events; +extern atomic_t softirqs_off_events; +extern atomic_t redundant_softirqs_on; +extern atomic_t redundant_softirqs_off; +extern atomic_t nr_unused_locks; +extern atomic_t nr_cyclic_checks; +extern atomic_t nr_cyclic_check_recursions; +extern atomic_t nr_find_usage_forwards_checks; +extern atomic_t nr_find_usage_forwards_recursions; +extern atomic_t nr_find_usage_backwards_checks; +extern atomic_t nr_find_usage_backwards_recursions; +# define debug_atomic_inc(ptr) atomic_inc(ptr) +# define debug_atomic_dec(ptr) atomic_dec(ptr) +# define debug_atomic_read(ptr) atomic_read(ptr) +#else +# define debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_dec(ptr) do { } while (0) +# define debug_atomic_read(ptr) 0 +#endif diff --git a/kernel/module.c b/kernel/module.c index 0351625767b1..35e1b1f859d7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1121,6 +1121,9 @@ static void free_module(struct module *mod) if (mod->percpu) percpu_modfree(mod->percpu); + /* Free lock-classes: */ + lockdep_free_key_range(mod->module_core, mod->core_size); + /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 16021b09c184..16c2e98b7638 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -48,7 +48,7 @@ config DEBUG_KERNEL config LOG_BUF_SHIFT int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL range 12 21 - default 17 if S390 + default 17 if S390 || LOCKDEP default 16 if X86_NUMAQ || IA64 default 15 if SMP default 14 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 5cd05f20bdec..9bdc8d440b2b 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -889,9 +890,6 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #include "locking-selftest-softirq.h" // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) -#define 
lockdep_reset() -#define lockdep_reset_lock(x) - #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) -- cgit v1.2.3 From 4ea2176dfa714882e88180b474e4cbcd888b70af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:53 -0700 Subject: [PATCH] lockdep: prove rwsem locking correctness Use the lock validator framework to prove rwsem locking correctness. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/rwsem.h | 35 +++++++++++++++++++------ include/asm-s390/rwsem.h | 31 ++++++++++++++++++++-- include/asm-s390/semaphore.h | 3 ++- include/linux/rwsem-spinlock.h | 23 ++++++++++++++-- include/linux/rwsem.h | 59 ++++++++++++++++++------------------------ kernel/Makefile | 2 +- kernel/rwsem.c | 42 ++++++++++++++++++++++++++++++ lib/rwsem-spinlock.c | 20 +++++++++++--- lib/rwsem.c | 20 ++++++++++++++ 9 files changed, 184 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h index 558804e4a039..2f07601562e7 100644 --- a/include/asm-i386/rwsem.h +++ b/include/asm-i386/rwsem.h @@ -40,6 +40,7 @@ #include #include +#include struct rwsem_waiter; @@ -61,21 +62,34 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) spinlock_t wait_lock; struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + + #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ - } + __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} +extern void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) /* * lock for reading @@ -128,7 +142,7 @@ LOCK_PREFIX " cmpxchgl %2,%0\n\t" /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { int tmp; @@ -152,6 +166,11 @@ LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the : "memory", "cc"); } +static inline void __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/include/asm-s390/rwsem.h b/include/asm-s390/rwsem.h index 0422a085dd56..13ec16965150 100644 --- a/include/asm-s390/rwsem.h +++ b/include/asm-s390/rwsem.h @@ -61,6 +61,9 @@ struct rw_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif }; #ifndef __s390x__ @@ -80,8 +83,16 @@ struct rw_semaphore { /* * initialisation */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + #define __RWSEM_INITIALIZER(name) \ -{ 
RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) } +{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) @@ -93,6 +104,17 @@ static inline void init_rwsem(struct rw_semaphore *sem) INIT_LIST_HEAD(&sem->wait_list); } +extern void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + + /* * lock for reading */ @@ -155,7 +177,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { signed long old, new, tmp; @@ -181,6 +203,11 @@ static inline void __down_write(struct rw_semaphore *sem) rwsem_down_write_failed(sem); } +static inline void __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/include/asm-s390/semaphore.h b/include/asm-s390/semaphore.h index 702cf436698c..32cdc69f39f4 100644 --- a/include/asm-s390/semaphore.h +++ b/include/asm-s390/semaphore.h @@ -37,7 +37,8 @@ struct semaphore { static inline void sema_init (struct semaphore *sem, int val) { - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER((*sem),val); + atomic_set(&sem->count, val); + init_waitqueue_head(&sem->wait); } static inline void init_MUTEX (struct semaphore *sem) diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index d68afcc36ac9..ae1fcadd598e 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -32,18 +32,37 @@ struct rw_semaphore { __s32 activity; spinlock_t wait_lock; struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + #define __RWSEM_INITIALIZER(name) \ -{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) } +{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) -extern void FASTCALL(init_rwsem(struct rw_semaphore *sem)); +extern void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + extern void FASTCALL(__down_read(struct rw_semaphore *sem)); extern int FASTCALL(__down_read_trylock(struct rw_semaphore *sem)); extern void FASTCALL(__down_write(struct rw_semaphore *sem)); +extern void FASTCALL(__down_write_nested(struct rw_semaphore *sem, int subclass)); extern int FASTCALL(__down_write_trylock(struct rw_semaphore *sem)); extern void FASTCALL(__up_read(struct rw_semaphore *sem)); extern void FASTCALL(__up_write(struct rw_semaphore *sem)); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 93581534b915..658afb37c3f5 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -27,64 +27,55 @@ struct rw_semaphore; /* * lock for reading */ -static inline void down_read(struct rw_semaphore *sem) -{ - might_sleep(); - 
__down_read(sem); -} +extern void down_read(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int down_read_trylock(struct rw_semaphore *sem) -{ - int ret; - ret = __down_read_trylock(sem); - return ret; -} +extern int down_read_trylock(struct rw_semaphore *sem); /* * lock for writing */ -static inline void down_write(struct rw_semaphore *sem) -{ - might_sleep(); - __down_write(sem); -} +extern void down_write(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int down_write_trylock(struct rw_semaphore *sem) -{ - int ret; - ret = __down_write_trylock(sem); - return ret; -} +extern int down_write_trylock(struct rw_semaphore *sem); /* * release a read lock */ -static inline void up_read(struct rw_semaphore *sem) -{ - __up_read(sem); -} +extern void up_read(struct rw_semaphore *sem); /* * release a write lock */ -static inline void up_write(struct rw_semaphore *sem) -{ - __up_write(sem); -} +extern void up_write(struct rw_semaphore *sem); /* * downgrade write lock to read lock */ -static inline void downgrade_write(struct rw_semaphore *sem) -{ - __downgrade_write(sem); -} +extern void downgrade_write(struct rw_semaphore *sem); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * nested locking: + */ +extern void down_read_nested(struct rw_semaphore *sem, int subclass); +extern void down_write_nested(struct rw_semaphore *sem, int subclass); +/* + * Take/release a lock when not the owner will release it: + */ +extern void down_read_non_owner(struct rw_semaphore *sem); +extern void up_read_non_owner(struct rw_semaphore *sem); +#else +# define down_read_nested(sem, subclass) down_read(sem) +# define down_write_nested(sem, subclass) down_write(sem) +# define down_read_non_owner(sem) down_read(sem) +# define up_read_non_owner(sem) up_read(sem) +#endif #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 3dd994eedc5c..df6ef3263699 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o + hrtimer.o rwsem.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 790a99bb25aa..291ded556aa0 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -103,3 +103,45 @@ void downgrade_write(struct rw_semaphore *sem) } EXPORT_SYMBOL(downgrade_write); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_nested); + +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + +void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + __down_write_nested(sem, subclass); +} + +EXPORT_SYMBOL(down_write_nested); + +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + +#endif + + diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 03b6097eb04e..db4fed74b940 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -20,8 +20,16 @@ struct rwsem_waiter { /* * 
initialise the semaphore */ -void fastcall init_rwsem(struct rw_semaphore *sem) +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key); +#endif sem->activity = 0; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); @@ -183,7 +191,7 @@ int fastcall __down_read_trylock(struct rw_semaphore *sem) * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void fastcall __sched __down_write(struct rw_semaphore *sem) +void fastcall __sched __down_write_nested(struct rw_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -223,6 +231,11 @@ void fastcall __sched __down_write(struct rw_semaphore *sem) ; } +void fastcall __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ @@ -292,9 +305,10 @@ void fastcall __downgrade_write(struct rw_semaphore *sem) spin_unlock_irqrestore(&sem->wait_lock, flags); } -EXPORT_SYMBOL(init_rwsem); +EXPORT_SYMBOL(__init_rwsem); EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); +EXPORT_SYMBOL(__down_write_nested); EXPORT_SYMBOL(__down_write); EXPORT_SYMBOL(__down_write_trylock); EXPORT_SYMBOL(__up_read); diff --git a/lib/rwsem.c b/lib/rwsem.c index bae597284889..b322421c2969 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -8,6 +8,26 @@ #include #include +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key); +#endif + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +EXPORT_SYMBOL(__init_rwsem); + struct rwsem_waiter { struct list_head list; struct task_struct *task; -- cgit v1.2.3 From 8a25d5debff2daee280e83e09d8c25d67c26a972 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:54 -0700 Subject: [PATCH] lockdep: prove spinlock rwlock locking correctness Use the lock validator framework to prove spinlock and rwlock locking correctness. 
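As a minimal illustrative sketch of what this validation buys (not part of the patch; lock_a, lock_b and the two functions are hypothetical): once spinlocks carry a dep_map, acquiring two locks in opposite orders on different code paths is reported as a potential deadlock the first time both orders have been observed, without the deadlock ever having to trigger.

static DEFINE_SPINLOCK(lock_a);
static DEFINE_SPINLOCK(lock_b);

static void path_one(void)
{
	spin_lock(&lock_a);
	spin_lock(&lock_b);	/* establishes the ordering a -> b */
	spin_unlock(&lock_b);
	spin_unlock(&lock_a);
}

static void path_two(void)
{
	spin_lock(&lock_b);
	spin_lock(&lock_a);	/* b -> a contradicts a -> b: lockdep warns here */
	spin_unlock(&lock_a);
	spin_unlock(&lock_b);
}
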
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/spinlock.h | 7 ++++ include/linux/spinlock.h | 63 ++++++++++++++++++++++--------- include/linux/spinlock_api_smp.h | 2 + include/linux/spinlock_api_up.h | 1 + include/linux/spinlock_types.h | 32 ++++++++++++++-- include/linux/spinlock_types_up.h | 9 ++++- include/linux/spinlock_up.h | 1 - kernel/Makefile | 1 + kernel/sched.c | 10 +++++ kernel/spinlock.c | 79 ++++++++++++++++++++++++++++++++++----- lib/kernel_lock.c | 7 +++- lib/spinlock_debug.c | 36 ++++++++++++++++++ net/ipv4/route.c | 3 +- 13 files changed, 217 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index 7e29b51bcaa0..87c40f830653 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -68,6 +68,12 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock) "=m" (lock->slock) : : "memory"); } +/* + * It is easier for the lock validator if interrupts are not re-enabled + * in the middle of a lock-acquire. This is a performance feature anyway + * so we turn it off: + */ +#ifndef CONFIG_PROVE_LOCKING static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) { alternative_smp( @@ -75,6 +81,7 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla __raw_spin_lock_string_up, "=m" (lock->slock) : "r" (flags) : "memory"); } +#endif static inline int __raw_spin_trylock(raw_spinlock_t *lock) { diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ae23beef9cc9..31473db92d3b 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -82,14 +82,40 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); /* * Pull the __raw*() functions/declarations (UP-nondebug doesnt need them): */ -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP # include #else # include #endif -#define spin_lock_init(lock) do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) -#define rwlock_init(lock) do { *(lock) = RW_LOCK_UNLOCKED; } while (0) +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key); +# define spin_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __spin_lock_init((lock), #lock, &__key); \ +} while (0) + +#else +# define spin_lock_init(lock) \ + do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key); +# define rwlock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __rwlock_init((lock), #lock, &__key); \ +} while (0) +#else +# define rwlock_init(lock) \ + do { *(lock) = RW_LOCK_UNLOCKED; } while (0) +#endif #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) @@ -113,7 +139,6 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) extern int _raw_spin_trylock(spinlock_t *lock); extern void _raw_spin_unlock(spinlock_t *lock); - extern void _raw_read_lock(rwlock_t *lock); extern int _raw_read_trylock(rwlock_t *lock); extern void _raw_read_unlock(rwlock_t *lock); @@ -121,17 +146,17 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); extern int _raw_write_trylock(rwlock_t *lock); extern void _raw_write_unlock(rwlock_t *lock); #else -# define _raw_spin_unlock(lock) 
__raw_spin_unlock(&(lock)->raw_lock) -# define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) # define _raw_spin_lock_flags(lock, flags) \ __raw_spin_lock_flags(&(lock)->raw_lock, *(flags)) +# define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) +# define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) -# define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) -# define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) -# define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) +# define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) +# define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) +# define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif #define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) @@ -147,6 +172,13 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); #define write_trylock(lock) __cond_lock(_write_trylock(lock)) #define spin_lock(lock) _spin_lock(lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) +#else +# define spin_lock_nested(lock, subclass) _spin_lock(lock) +#endif + #define write_lock(lock) _write_lock(lock) #define read_lock(lock) _read_lock(lock) @@ -172,21 +204,18 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); /* * We inline the unlock functions in the nondebug case: */ -#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) +#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ + !defined(CONFIG_SMP) # define spin_unlock(lock) _spin_unlock(lock) # define read_unlock(lock) _read_unlock(lock) # define write_unlock(lock) _write_unlock(lock) -#else -# define spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) -# define read_unlock(lock) __raw_read_unlock(&(lock)->raw_lock) -# define write_unlock(lock) __raw_write_unlock(&(lock)->raw_lock) -#endif - -#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) # define spin_unlock_irq(lock) _spin_unlock_irq(lock) # define read_unlock_irq(lock) _read_unlock_irq(lock) # define write_unlock_irq(lock) _write_unlock_irq(lock) #else +# define spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) +# define read_unlock(lock) __raw_read_unlock(&(lock)->raw_lock) +# define write_unlock(lock) __raw_write_unlock(&(lock)->raw_lock) # define spin_unlock_irq(lock) \ do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0) # define read_unlock_irq(lock) \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 78e6989ffb54..b2c4f8299464 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -20,6 +20,8 @@ int in_lock_functions(unsigned long addr); #define assert_spin_locked(x) BUG_ON(!spin_is_locked(x)) void __lockfunc _spin_lock(spinlock_t *lock) __acquires(spinlock_t); +void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) + __acquires(spinlock_t); void __lockfunc _read_lock(rwlock_t *lock) __acquires(rwlock_t); void __lockfunc _write_lock(rwlock_t *lock) __acquires(rwlock_t); void __lockfunc _spin_lock_bh(spinlock_t 
*lock) __acquires(spinlock_t); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index cd81cee566f4..67faa044c5f5 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -49,6 +49,7 @@ do { local_irq_restore(flags); __UNLOCK(lock); } while (0) #define _spin_lock(lock) __LOCK(lock) +#define _spin_lock_nested(lock, subclass) __LOCK(lock) #define _read_lock(lock) __LOCK(lock) #define _write_lock(lock) __LOCK(lock) #define _spin_lock_bh(lock) __LOCK_BH(lock) diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index f5d4ed7bc785..dc5fb69e4de9 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,6 +9,8 @@ * Released under the General Public License (GPL). */ +#include + #if defined(CONFIG_SMP) # include #else @@ -24,6 +26,9 @@ typedef struct { unsigned int magic, owner_cpu; void *owner; #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } spinlock_t; #define SPINLOCK_MAGIC 0xdead4ead @@ -37,28 +42,47 @@ typedef struct { unsigned int magic, owner_cpu; void *owner; #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } rwlock_t; #define RWLOCK_MAGIC 0xdeaf1eed #define SPINLOCK_OWNER_INIT ((void *)-1L) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define SPIN_DEP_MAP_INIT(lockname) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + #ifdef CONFIG_DEBUG_SPINLOCK # define __SPIN_LOCK_UNLOCKED(lockname) \ (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ - .owner_cpu = -1 } + .owner_cpu = -1, \ + SPIN_DEP_MAP_INIT(lockname) } #define __RW_LOCK_UNLOCKED(lockname) \ (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ .magic = RWLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ - .owner_cpu = -1 } + .owner_cpu = -1, \ + RW_DEP_MAP_INIT(lockname) } #else # define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED } + (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ + SPIN_DEP_MAP_INIT(lockname) } #define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED } + (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ + RW_DEP_MAP_INIT(lockname) } #endif #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index 04135b0e198e..27644af20b7c 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -12,10 +12,14 @@ * Released under the General Public License (GPL). 
*/ -#ifdef CONFIG_DEBUG_SPINLOCK +#if defined(CONFIG_DEBUG_SPINLOCK) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) typedef struct { volatile unsigned int slock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } @@ -30,6 +34,9 @@ typedef struct { } raw_spinlock_t; typedef struct { /* no debug version on UP */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { } diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 31accf2f0b13..ea54c4c9a4ec 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -18,7 +18,6 @@ */ #ifdef CONFIG_DEBUG_SPINLOCK - #define __raw_spin_is_locked(x) ((x)->slock == 0) static inline void __raw_spin_lock(raw_spinlock_t *lock) diff --git a/kernel/Makefile b/kernel/Makefile index df6ef3263699..47dbcd570cd8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/sched.c b/kernel/sched.c index 911829966534..ae4db0185bb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -308,6 +308,13 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; #endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + spin_unlock_irq(&rq->lock); } @@ -1778,6 +1785,7 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); @@ -4384,6 +4392,7 @@ asmlinkage long sys_sched_yield(void) * no need to preempt or enable interrupts: */ __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); @@ -4447,6 +4456,7 @@ int cond_resched_lock(spinlock_t *lock) spin_lock(lock); } if (need_resched() && __resched_legal()) { + spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index b31e54eadf56..bfd6ad9c0330 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -29,8 +30,10 @@ EXPORT_SYMBOL(generic__raw_read_trylock); int __lockfunc _spin_trylock(spinlock_t *lock) { preempt_disable(); - if (_raw_spin_trylock(lock)) + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable(); return 0; @@ -40,8 +43,10 @@ EXPORT_SYMBOL(_spin_trylock); int __lockfunc _read_trylock(rwlock_t *lock) { preempt_disable(); - if (_raw_read_trylock(lock)) + if (_raw_read_trylock(lock)) { + rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable(); return 0; @@ -51,19 +56,28 @@ EXPORT_SYMBOL(_read_trylock); int __lockfunc _write_trylock(rwlock_t *lock) { preempt_disable(); - if (_raw_write_trylock(lock)) + if (_raw_write_trylock(lock)) { + rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable(); return 0; } EXPORT_SYMBOL(_write_trylock); -#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ + defined(CONFIG_PROVE_LOCKING) void __lockfunc _read_lock(rwlock_t *lock) { preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } EXPORT_SYMBOL(_read_lock); @@ -74,7 +88,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) local_irq_save(flags); preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + /* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_spin_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#ifdef CONFIG_PROVE_LOCKING + _raw_spin_lock(lock); +#else _raw_spin_lock_flags(lock, &flags); +#endif return flags; } EXPORT_SYMBOL(_spin_lock_irqsave); @@ -83,6 +107,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) { local_irq_disable(); preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } EXPORT_SYMBOL(_spin_lock_irq); @@ -91,6 +116,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) { local_bh_disable(); preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } EXPORT_SYMBOL(_spin_lock_bh); @@ -101,6 +127,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); return flags; } @@ -110,6 +137,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) { local_irq_disable(); preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } EXPORT_SYMBOL(_read_lock_irq); @@ -118,6 +146,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) { 
local_bh_disable(); preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } EXPORT_SYMBOL(_read_lock_bh); @@ -128,6 +157,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); return flags; } @@ -137,6 +167,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) { local_irq_disable(); preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } EXPORT_SYMBOL(_write_lock_irq); @@ -145,6 +176,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) { local_bh_disable(); preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } EXPORT_SYMBOL(_write_lock_bh); @@ -152,6 +184,7 @@ EXPORT_SYMBOL(_write_lock_bh); void __lockfunc _spin_lock(spinlock_t *lock) { preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } @@ -160,6 +193,7 @@ EXPORT_SYMBOL(_spin_lock); void __lockfunc _write_lock(rwlock_t *lock) { preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } @@ -255,8 +289,22 @@ BUILD_LOCK_OPS(write, rwlock); #endif /* CONFIG_PREEMPT */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + _raw_spin_lock(lock); +} + +EXPORT_SYMBOL(_spin_lock_nested); + +#endif + void __lockfunc _spin_unlock(spinlock_t *lock) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable(); } @@ -264,6 +312,7 @@ EXPORT_SYMBOL(_spin_unlock); void __lockfunc _write_unlock(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable(); } @@ -271,6 +320,7 @@ EXPORT_SYMBOL(_write_unlock); void __lockfunc _read_unlock(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable(); } @@ -278,6 +328,7 @@ EXPORT_SYMBOL(_read_unlock); void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -286,6 +337,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore); void __lockfunc _spin_unlock_irq(spinlock_t *lock) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); local_irq_enable(); preempt_enable(); @@ -294,14 +346,16 @@ EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } EXPORT_SYMBOL(_spin_unlock_bh); void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -310,6 +364,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore); void __lockfunc _read_unlock_irq(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); local_irq_enable(); preempt_enable(); @@ -318,14 +373,16 @@ EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } EXPORT_SYMBOL(_read_unlock_bh); void 
__lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -334,6 +391,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore); void __lockfunc _write_unlock_irq(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); local_irq_enable(); preempt_enable(); @@ -342,9 +400,10 @@ EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } EXPORT_SYMBOL(_write_unlock_bh); @@ -352,11 +411,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) { local_bh_disable(); preempt_disable(); - if (_raw_spin_trylock(lock)) + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); return 0; } EXPORT_SYMBOL(_spin_trylock_bh); diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index e713e86811ae..e0fdfddb406e 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -177,7 +177,12 @@ static inline void __lock_kernel(void) static inline void __unlock_kernel(void) { - spin_unlock(&kernel_flag); + /* + * the BKL is not covered by lockdep, so we open-code the + * unlocking sequence (and thus avoid the dep-chain ops): + */ + _raw_spin_unlock(&kernel_flag); + preempt_enable(); } /* diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index 3de2ccf48ac6..3d9c4dc965ed 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -12,6 +12,42 @@ #include #include +void __spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key); +#endif + lock->raw_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + lock->magic = SPINLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key); +#endif + lock->raw_lock = (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; + lock->magic = RWLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + static void spin_bug(spinlock_t *lock, const char *msg) { struct task_struct *owner = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index da44fabf4dc5..283a72247e5e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -205,7 +205,8 @@ __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; }; -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ + defined(CONFIG_PROVE_LOCKING) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks * The size of this table is a power of two and depends on the number of CPUS. 
-- cgit v1.2.3 From ef5d4707b9065c0cf8a69fa3716893f3b75201ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:55 -0700 Subject: [PATCH] lockdep: prove mutex locking correctness Use the lock validator framework to prove mutex locking correctness. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mutex-debug.h | 8 +++++++- include/linux/mutex.h | 31 ++++++++++++++++++++++++++++--- kernel/mutex-debug.c | 6 +++++- kernel/mutex.c | 28 ++++++++++++++++++++++------ kernel/mutex.h | 2 +- 5 files changed, 63 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h index 70a26091fc73..2537285e1064 100644 --- a/include/linux/mutex-debug.h +++ b/include/linux/mutex-debug.h @@ -2,6 +2,7 @@ #define __LINUX_MUTEX_DEBUG_H #include +#include /* * Mutexes - debugging helpers: @@ -10,7 +11,12 @@ #define __DEBUG_MUTEX_INITIALIZER(lockname) \ , .magic = &lockname -#define mutex_init(sem) __mutex_init(sem, __FILE__":"#sem) +#define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ +} while (0) extern void FASTCALL(mutex_destroy(struct mutex *lock)); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index caafecd5e366..27c48daa3183 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -53,6 +54,9 @@ struct mutex { const char *name; void *magic; #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif }; /* @@ -72,20 +76,34 @@ struct mutex_waiter { # include #else # define __DEBUG_MUTEX_INITIALIZER(lockname) -# define mutex_init(mutex) __mutex_init(mutex, NULL) +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ +} while (0) # define mutex_destroy(mutex) do { } while (0) #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = SPIN_LOCK_UNLOCKED \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ - __DEBUG_MUTEX_INITIALIZER(lockname) } + __DEBUG_MUTEX_INITIALIZER(lockname) \ + __DEP_MAP_MUTEX_INITIALIZER(lockname) } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void fastcall __mutex_init(struct mutex *lock, const char *name); +extern void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); /*** * mutex_is_locked - is the mutex locked @@ -104,6 +122,13 @@ static inline int fastcall mutex_is_locked(struct mutex *lock) */ extern void fastcall mutex_lock(struct mutex *lock); extern int fastcall mutex_lock_interruptible(struct mutex *lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); +#else +# define mutex_lock_nested(lock, subclass) mutex_lock(lock) +#endif + /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! 
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 5569766a1ea2..e3203c654dda 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -83,12 +83,16 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); } -void debug_mutex_init(struct mutex *lock, const char *name) +void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key); +#endif lock->owner = NULL; lock->magic = lock; } diff --git a/kernel/mutex.c b/kernel/mutex.c index 43a50c18701a..8c71cf72a497 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -39,13 +39,14 @@ * * It is not allowed to initialize an already locked mutex. */ -__always_inline void fastcall __mutex_init(struct mutex *lock, const char *name) +void +__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - debug_mutex_init(lock, name); + debug_mutex_init(lock, name, key); } EXPORT_SYMBOL(__mutex_init); @@ -131,6 +132,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); debug_mutex_add_waiter(lock, &waiter, task->thread_info); /* add waiting tasks to the end of the waitqueue (FIFO): */ @@ -158,6 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { mutex_remove_waiter(lock, &waiter, task->thread_info); + mutex_release(&lock->dep_map, 1, _RET_IP_); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -194,16 +197,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched +mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); +} + +EXPORT_SYMBOL_GPL(mutex_lock_nested); +#endif + /* * Release the lock, slowpath: */ static fastcall inline void -__mutex_unlock_common_slowpath(atomic_t *lock_count) +__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) { struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); debug_mutex_unlock(lock); /* @@ -236,7 +251,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count) static fastcall noinline void __mutex_unlock_slowpath(atomic_t *lock_count) { - __mutex_unlock_common_slowpath(lock_count); + __mutex_unlock_common_slowpath(lock_count, 1); } /* @@ -287,9 +302,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) + if (likely(prev == 1)) { debug_mutex_set_owner(lock, current_thread_info()); - + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); diff --git a/kernel/mutex.h b/kernel/mutex.h index aeb2d916aa0e..a075dafbb290 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -22,7 +22,7 @@ #define 
debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) #define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name) do { } while (0) +#define debug_mutex_init(lock, name, key) do { } while (0) static inline void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -- cgit v1.2.3 From a90b9c05df3c1e58eaedc28795d0f5abd896c098 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:04 -0700 Subject: [PATCH] lockdep: annotate dcache Teach special (recursive) locking code to the lock validator. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 4 ++-- include/linux/dcache.h | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/dcache.c b/fs/dcache.c index bec4de176c81..1b4a3a34ec57 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1339,10 +1339,10 @@ void d_move(struct dentry * dentry, struct dentry * target) */ if (target < dentry) { spin_lock(&target->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); } else { spin_lock(&dentry->d_lock); - spin_lock(&target->d_lock); + spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED); } /* Move the dentry to the target hash queue, if on different bucket */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 0dd1610a94a9..471781ffeab1 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -114,6 +114,18 @@ struct dentry { unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; +/* + * dentry->d_lock spinlock nesting subclasses: + * + * 0: normal + * 1: nested + */ +enum dentry_d_lock_class +{ + DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ + DENTRY_D_LOCK_NESTED +}; + struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash) (struct dentry *, struct qstr *); -- cgit v1.2.3 From f2eace23e924bd3f05aedea4fc505eb5508d2d93 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:05 -0700 Subject: [PATCH] lockdep: annotate i_mutex Teach special (recursive) locking code to the lock validator. Has no effect on non-lockdep kernels. 
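An illustrative sketch of the resulting usage (not part of the patch; the helper function is hypothetical): when a parent and a child i_mutex are held together, the two acquisitions carry distinct subclasses, so the validator does not mistake the second lock of the same class for recursion, while still checking ordering between the subclasses.

static void lock_parent_and_child(struct inode *dir, struct inode *inode)
{
	/* parent first, per the documented parent -> child ordering */
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);

	/* ... operate on both inodes ... */

	mutex_unlock(&inode->i_mutex);
	mutex_unlock(&dir->i_mutex);
}
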
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/core/inode.c | 2 +- fs/namei.c | 20 ++++++++++---------- include/linux/fs.h | 19 +++++++++++++++++++ 3 files changed, 30 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index e47e3a8ed6e4..cf11196e012d 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -200,7 +200,7 @@ static void update_sb(struct super_block *sb) if (!root) return; - mutex_lock(&root->d_inode->i_mutex); + mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) { if (bus->d_inode) { diff --git a/fs/namei.c b/fs/namei.c index c784e8bb57a3..c9750d755aff 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1423,7 +1423,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock(&p1->d_inode->i_mutex); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); return NULL; } @@ -1431,22 +1431,22 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) for (p = p1; p->d_parent != p; p = p->d_parent) { if (p->d_parent == p2) { - mutex_lock(&p2->d_inode->i_mutex); - mutex_lock(&p1->d_inode->i_mutex); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); return p; } } for (p = p2; p->d_parent != p; p = p->d_parent) { if (p->d_parent == p1) { - mutex_lock(&p1->d_inode->i_mutex); - mutex_lock(&p2->d_inode->i_mutex); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return p; } } - mutex_lock(&p1->d_inode->i_mutex); - mutex_lock(&p2->d_inode->i_mutex); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } @@ -1751,7 +1751,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) { struct dentry *dentry = ERR_PTR(-EEXIST); - mutex_lock(&nd->dentry->d_inode->i_mutex); + mutex_lock_nested(&nd->dentry->d_inode->i_mutex, I_MUTEX_PARENT); /* * Yucky last component or no last component at all? 
* (foo/., foo/.., /////) @@ -2008,7 +2008,7 @@ static long do_rmdir(int dfd, const char __user *pathname) error = -EBUSY; goto exit1; } - mutex_lock(&nd.dentry->d_inode->i_mutex); + mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -2082,7 +2082,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; - mutex_lock(&nd.dentry->d_inode->i_mutex); + mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e04a5cfe874f..05ded9e76b23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -542,6 +542,25 @@ struct inode { #endif }; +/* + * inode->i_mutex nesting subclasses for the lock validator: + * + * 0: the object of the current VFS operation + * 1: parent + * 2: child/target + * 3: quota file + * + * The locking order between these classes is + * parent -> child -> normal -> quota + */ +enum inode_i_mutex_lock_class +{ + I_MUTEX_NORMAL, + I_MUTEX_PARENT, + I_MUTEX_CHILD, + I_MUTEX_QUOTA +}; + /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic -- cgit v1.2.3 From 243c7621aac4ed1aa79524c9a1cecf7c05a28124 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:06 -0700 Subject: [PATCH] lockdep: annotate genirq Teach special (recursive) locking code to the lock validator. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 6 ++++++ init/main.c | 1 + kernel/irq/handle.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 80ec7a4dbc98..316e0fb8d7b1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -270,6 +270,12 @@ static inline int lockdep_internal(void) struct lock_class_key { }; #endif /* !LOCKDEP */ +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) +extern void early_init_irq_lock_class(void); +#else +# define early_init_irq_lock_class() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS extern void early_boot_irqs_off(void); extern void early_boot_irqs_on(void); diff --git a/init/main.c b/init/main.c index fc473d4b56fd..628b8e9e841a 100644 --- a/init/main.c +++ b/init/main.c @@ -466,6 +466,7 @@ asmlinkage void __init start_kernel(void) local_irq_disable(); early_boot_irqs_off(); + early_init_irq_lock_class(); /* * Interrupts are still disabled. Do necessary setups, then diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index aeb6e391276c..a7b497ee919e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -249,3 +249,19 @@ out: return 1; } +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + +void early_init_irq_lock_class(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); +} + +#endif -- cgit v1.2.3 From eb4542b98c81e22e08587b747b21986a45360999 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:07 -0700 Subject: [PATCH] lockdep: annotate waitqueues Create one lock class for all waitqueue locks in the kernel. 
Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 6 ++++++ kernel/wait.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index bc4f389c49bb..794be7af58ae 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,9 +77,15 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } +/* + * lockdep: we want one lock-class for all waitqueue locks. + */ +extern struct lock_class_key waitqueue_lock_key; + static inline void init_waitqueue_head(wait_queue_head_t *q) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, &waitqueue_lock_key); INIT_LIST_HEAD(&q->task_list); } diff --git a/kernel/wait.c b/kernel/wait.c index 5985d866531f..a1d57aeb7f75 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,6 +10,10 @@ #include #include +struct lock_class_key waitqueue_lock_key; + +EXPORT_SYMBOL(waitqueue_lock_key); + void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; -- cgit v1.2.3 From 06825ba3553151eea24206bc53d4fc3de49e0ab1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:09 -0700 Subject: [PATCH] lockdep: annotate skb_queue_head_init Teach special (multi-initialized) locking code to the lock validator. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 57d7d4965f9a..3597b4f14389 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -604,9 +604,12 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) return list_->qlen; } +extern struct lock_class_key skb_queue_lock_key; + static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); + lockdep_set_class(&list->lock, &skb_queue_lock_key); list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7cfbdb215ba2..44f6a181a754 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,13 @@ static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly; +/* + * lockdep: lock class key used by skb_queue_head_init(): + */ +struct lock_class_key skb_queue_lock_key; + +EXPORT_SYMBOL(skb_queue_lock_key); + /* * Keep out-of-line to prevent kernel bloat. * __builtin_return_address is not used because it is not always -- cgit v1.2.3 From 543655244866b8ec648fea1eb9c32a35ffba5721 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:11 -0700 Subject: [PATCH] lockdep: annotate hrtimer base locks Teach special (recursive) locking code to the lock validator. Has no effect on non-lockdep kernels. 
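The pattern, as an illustrative sketch (not part of the patch; struct my_base and its init function are hypothetical): embedding a lock_class_key next to the lock and registering it with lockdep_set_class() gives each instance its own lock class instead of one class per initialization site. Note the key must live in static storage, which holds for the per-CPU hrtimer bases the patch annotates.

struct my_base {
	spinlock_t		lock;
	struct lock_class_key	lock_key;	/* must be statically allocated */
};

static void my_base_init(struct my_base *base)
{
	spin_lock_init(&base->lock);
	/* reclassify: one lock class per base, not one per init site */
	lockdep_set_class(&base->lock, &base->lock_key);
}
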
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 1 + kernel/hrtimer.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 07d7305f131e..e4bccbcc2750 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -91,6 +91,7 @@ struct hrtimer_base { ktime_t (*get_softirq_time)(void); struct hrtimer *curr_timer; ktime_t softirq_time; + struct lock_class_key lock_key; }; /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8d3dc29ef41a..617304ce67db 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -782,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu) struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) + for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { spin_lock_init(&base->lock); + lockdep_set_class(&base->lock, &base->lock_key); + } } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From da21f24dd73954c2ed0cd39a698e2c9916c05d71 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:12 -0700 Subject: [PATCH] lockdep: annotate sock_lock_init() Teach special (multi-initialized, per-address-family) locking code to the lock validator. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 4 ++++ net/core/sock.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7b3d6b856946..83805feea880 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -80,8 +80,12 @@ typedef struct { wait_queue_head_t wq; } socket_lock_t; +extern struct lock_class_key af_family_keys[AF_MAX]; + #define sock_lock_init(__sk) \ do { spin_lock_init(&((__sk)->sk_lock.slock)); \ + lockdep_set_class(&(__sk)->sk_lock.slock, \ + af_family_keys + (__sk)->sk_family); \ (__sk)->sk_lock.owner = NULL; \ init_waitqueue_head(&((__sk)->sk_lock.wq)); \ } while(0) diff --git a/net/core/sock.c b/net/core/sock.c index 533b9317144b..0b4d5d25b23c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -129,6 +129,18 @@ #include #endif +/* + * Each address family might have different locking rules, so we have + * one slock key per address family: + */ +struct lock_class_key af_family_keys[AF_MAX]; + +/* + * sk_callback_lock locking rules are per-address-family, + * so split the lock classes by using a per-AF key: + */ +static struct lock_class_key af_callback_keys[AF_MAX]; + /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. 
This makes socket queueing behavior and performance @@ -848,6 +860,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) rwlock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); + lockdep_set_class(&newsk->sk_callback_lock, + af_callback_keys + newsk->sk_family); newsk->sk_dst_cache = NULL; newsk->sk_wmem_queued = 0; @@ -1422,6 +1436,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) rwlock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); + lockdep_set_class(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; -- cgit v1.2.3 From a09785a2414afb261d9f719d544742af4300df22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:12 -0700 Subject: [PATCH] lockdep: annotate af_unix locking Teach special (recursive) locking code to the lock validator. Also splits af_unix's sk_receive_queue.lock class from the other networking skb-queue locks. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/af_unix.h | 3 +++ net/unix/af_unix.c | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 5ba72d95280c..2fec827c8801 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -67,6 +67,9 @@ struct unix_skb_parms { #define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock) #define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock) #define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_wlock_nested(s) \ + spin_lock_nested(&unix_sk(s)->lock, \ + SINGLE_DEPTH_NESTING) #define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock) #ifdef __KERNEL__ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aca650109425..e9a287bc3142 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -565,6 +565,14 @@ static struct proto unix_proto = { .obj_size = sizeof(struct unix_sock), }; +/* + * AF_UNIX sockets do not interact with hardware, hence they + * dont trigger interrupts - so it's safe for them to have + * bh-unsafe locking for their sk_receive_queue.lock. Split off + * this special lock-class by reinitializing the spinlock key: + */ +static struct lock_class_key af_unix_sk_receive_queue_lock_key; + static struct sock * unix_create1(struct socket *sock) { struct sock *sk = NULL; @@ -580,6 +588,8 @@ static struct sock * unix_create1(struct socket *sock) atomic_inc(&unix_nr_socks); sock_init_data(sock,sk); + lockdep_set_class(&sk->sk_receive_queue.lock, + &af_unix_sk_receive_queue_lock_key); sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = sysctl_unix_max_dgram_qlen; @@ -1045,7 +1055,7 @@ restart: goto out_unlock; } - unix_state_wlock(sk); + unix_state_wlock_nested(sk); if (sk->sk_state != st) { unix_state_wunlock(sk); -- cgit v1.2.3 From c63661848581a9842dfc72d9a400285dd284fc47 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:13 -0700 Subject: [PATCH] lockdep: annotate bh_lock_sock() Teach special (recursive) locking code to the lock validator. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 3 +++ net/ipv4/tcp_ipv4.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 83805feea880..0969fb60d6ea 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -751,6 +751,9 @@ extern void FASTCALL(release_sock(struct sock *sk)); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) +#define bh_lock_sock_nested(__sk) \ + spin_lock_nested(&((__sk)->sk_lock.slock), \ + SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) extern struct sock *sk_alloc(int family, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 823717285c6d..5a886e6efbbe 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1090,7 +1090,7 @@ process: skb->dev = NULL; - bh_lock_sock(sk); + bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { #ifdef CONFIG_NET_DMA -- cgit v1.2.3 From 366c7f554e888e51b8395f9b07b273fe775c7ff3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:25 -0700 Subject: [PATCH] lockdep: annotate enable_in_hardirq() Make use of local_irq_enable_in_hardirq() API to annotate places that enable hardirqs in hardirq context. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/nmi.c | 2 +- arch/x86_64/kernel/nmi.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- include/linux/ide.h | 2 +- kernel/irq/handle.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index a76e93146585..2dd928a84645 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -107,7 +107,7 @@ int nmi_active; static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 476c1472fc07..5baa0c726e97 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void) static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. 
This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 6ca3476d02c7..adbe9f76a505 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -838,7 +838,7 @@ static ide_startstop_t idefloppy_pc_intr (ide_drive_t *drive) "transferred\n", pc->actually_transferred); clear_bit(PC_DMA_IN_PROGRESS, &pc->flags); - local_irq_enable(); + local_irq_enable_in_hardirq(); if (status.b.check || test_bit(PC_DMA_ERROR, &pc->flags)) { /* Error detected */ diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 7dba9992ad30..d6bf1ec96322 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -693,7 +693,7 @@ static ide_startstop_t drive_cmd_intr (ide_drive_t *drive) u8 stat = hwif->INB(IDE_STATUS_REG); int retries = 10; - local_irq_enable(); + local_irq_enable_in_hardirq(); if ((stat & DRQ_STAT) && args && args[3]) { u8 io_32bit = drive->io_32bit; drive->io_32bit = 0; @@ -1286,7 +1286,7 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq) if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq) disable_irq_nosync(hwif->irq); spin_unlock(&ide_lock); - local_irq_enable(); + local_irq_enable_in_hardirq(); /* allow other IRQs while we start this request */ startstop = start_request(drive, rq); spin_lock_irq(&ide_lock); @@ -1631,7 +1631,7 @@ irqreturn_t ide_intr (int irq, void *dev_id, struct pt_regs *regs) spin_unlock(&ide_lock); if (drive->unmask) - local_irq_enable(); + local_irq_enable_in_hardirq(); /* service this interrupt, may set handler for next interrupt */ startstop = handler(drive); spin_lock_irq(&ide_lock); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 04547eb0833f..97a9244312fc 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -222,7 +222,7 @@ ide_startstop_t task_no_data_intr (ide_drive_t *drive) ide_hwif_t *hwif = HWIF(drive); u8 stat; - local_irq_enable(); + local_irq_enable_in_hardirq(); if (!OK_STAT(stat = hwif->INB(IDE_STATUS_REG),READY_STAT,BAD_STAT)) { return ide_error(drive, "task_no_data_intr", stat); /* calls ide_end_drive_cmd */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 285316c836b5..dc7abef10965 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1359,7 +1359,7 @@ extern struct semaphore ide_cfg_sem; * ide_drive_t->hwif: constant, no locking */ -#define local_irq_set(flags) do { local_save_flags((flags)); local_irq_enable(); } while (0) +#define local_irq_set(flags) do { local_save_flags((flags)); local_irq_enable_in_hardirq(); } while (0) extern struct bus_type ide_bus_type; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a7b497ee919e..fc4e906aedbd 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,7 +132,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, handle_dynamic_tick(action); if (!(action->flags & IRQF_DISABLED)) - local_irq_enable(); + local_irq_enable_in_hardirq(); do { ret = action->handler(irq, action->dev_id, regs); -- cgit v1.2.3 From cf51624999e56c88154b5f7d451a265db6aabff7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:27 -0700 Subject: [PATCH] lockdep: annotate ->s_lock Teach special (per-filesystem) locking code to the lock validator. Minimal effect on non-lockdep kernels: one extra parameter to alloc_super(). 
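An illustrative sketch of the idea (not part of the patch; the type names are hypothetical): keying the class off a static per-type descriptor splits what would otherwise be a single lock class shared by every superblock into one class per filesystem type, so each type can have its own s_lock ordering rules.

struct my_fs_type {
	struct lock_class_key s_lock_key;	/* one key, hence one class, per type */
};

static void my_sb_lock_init(struct mutex *lock, struct my_fs_type *type)
{
	mutex_init(lock);
	lockdep_set_class(lock, &type->s_lock_key);
}
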
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 10 ++++++++-- include/linux/fs.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index 9b780c42d845..5a4fe8be462a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -53,7 +53,7 @@ DEFINE_SPINLOCK(sb_lock); * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(void) +static struct super_block *alloc_super(struct file_system_type *type) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static struct super_operations default_op; @@ -72,6 +72,12 @@ static struct super_block *alloc_super(void) INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); + /* + * The locking rules for s_lock are up to the + * filesystem. For example ext3fs has different + * lock ordering than usbfs: + */ + lockdep_set_class(&s->s_lock, &type->s_lock_key); down_write(&s->s_umount); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); @@ -295,7 +301,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(); + s = alloc_super(type); if (!s) return ERR_PTR(-ENOMEM); goto retry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 05ded9e76b23..0a3ea52d711e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1295,6 +1295,7 @@ struct file_system_type { struct module *owner; struct file_system_type * next; struct list_head fs_supers; + struct lock_class_key s_lock_key; }; extern int get_sb_bdev(struct file_system_type *fs_type, -- cgit v1.2.3 From 897c6ff9568bcb102ffc6b465ebe1def0cba829d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 3 Jul 2006 00:25:28 -0700 Subject: [PATCH] lockdep: annotate sb ->s_umount The s_umount rwsem needs to be classified as per-superblock since it's perfectly legit to keep multiple of those recursively in the VFS locking rules. Has no effect on non-lockdep kernels. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 1 + include/linux/fs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index 5a4fe8be462a..6d4e8174b6db 100644 --- a/fs/super.c +++ b/fs/super.c @@ -72,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); + lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * The locking rules for s_lock are up to the * filesystem. For example ext3fs has different diff --git a/include/linux/fs.h b/include/linux/fs.h index 0a3ea52d711e..e26de68059af 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1296,6 +1296,7 @@ struct file_system_type { struct file_system_type * next; struct list_head fs_supers; struct lock_class_key s_lock_key; + struct lock_class_key s_umount_key; }; extern int get_sb_bdev(struct file_system_type *fs_type, -- cgit v1.2.3 From 663d440eaa496db903cc58be04b9b602ba45e43b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:33 -0700 Subject: [PATCH] lockdep: annotate blkdev nesting Teach special (recursive) locking code to the lock validator. 
Effects on non-lockdep kernels: - the introduction of the following function variants: extern struct block_device *open_partition_by_devnum(dev_t, unsigned); extern int blkdev_put_partition(struct block_device *); static int blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags); which on non-lockdep kernels are the same as open_by_devnum(), blkdev_put() and blkdev_get(). - a subclass parameter to do_open(). [unused on non-lockdep] - a subclass parameter to __blkdev_put(), which is a new internal function for the main blkdev_put*() functions. [parameter unused on non-lockdep kernels, except for two sanity-check WARN_ON()s] These functions carry no semantic difference - they only express object dependencies to the lockdep subsystem. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 ++-- fs/block_dev.c | 102 +++++++++++++++++++++++++++++++++++++++++++++-------- include/linux/fs.h | 17 +++++++++ 3 files changed, 107 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2fe32c261922..e4e161372a3e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1404,7 +1404,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); @@ -1414,7 +1414,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) if (err) { printk(KERN_ERR "md: could not bd_claim %s.\n", bdevname(bdev, b)); - blkdev_put(bdev); + blkdev_put_partition(bdev); return err; } rdev->bdev = bdev; @@ -1428,7 +1428,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) if (!bdev) MD_BUG(); bd_release(bdev); - blkdev_put(bdev); + blkdev_put_partition(bdev); } void md_autodetect_dev(dev_t dev); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9633a490dab0..37534573960b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -739,7 +739,7 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder, if (!bo) return -ENOMEM; - mutex_lock(&bdev->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION); res = bd_claim(bdev, holder); if (res || !add_bd_holder(bdev, bo)) free_bd_holder(bo); @@ -764,7 +764,7 @@ static void bd_release_from_kobject(struct block_device *bdev, if (!kobj) return; - mutex_lock(&bdev->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION); bd_release(bdev); if ((bo = del_bd_holder(bdev, kobj))) free_bd_holder(bo); @@ -822,6 +822,22 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); +static int +blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags); + +struct block_device *open_partition_by_devnum(dev_t dev, unsigned mode) +{ + struct block_device *bdev = bdget(dev); + int err = -ENOMEM; + int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY; + if (bdev) + err = blkdev_get_partition(bdev, mode, flags); + return err ? ERR_PTR(err) : bdev; +} + +EXPORT_SYMBOL(open_partition_by_devnum); + + /* * This routine checks whether a removable media has been changed, * and invalidates all buffer-cache-entries in that case.
This @@ -868,7 +884,11 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static int do_open(struct block_device *bdev, struct file *file) +static int +blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags); + +static int +do_open(struct block_device *bdev, struct file *file, unsigned int subclass) { struct module *owner = NULL; struct gendisk *disk; @@ -885,7 +905,8 @@ static int do_open(struct block_device *bdev, struct file *file) } owner = disk->fops->owner; - mutex_lock(&bdev->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, subclass); + if (!bdev->bd_openers) { bdev->bd_disk = disk; bdev->bd_contains = bdev; @@ -912,11 +933,11 @@ static int do_open(struct block_device *bdev, struct file *file) ret = -ENOMEM; if (!whole) goto out_first; - ret = blkdev_get(whole, file->f_mode, file->f_flags); + ret = blkdev_get_whole(whole, file->f_mode, file->f_flags); if (ret) goto out_first; bdev->bd_contains = whole; - mutex_lock(&whole->bd_mutex); + mutex_lock_nested(&whole->bd_mutex, BD_MUTEX_WHOLE); whole->bd_part_count++; p = disk->part[part - 1]; bdev->bd_inode->i_data.backing_dev_info = @@ -944,7 +965,8 @@ static int do_open(struct block_device *bdev, struct file *file) if (bdev->bd_invalidated) rescan_partitions(bdev->bd_disk, bdev); } else { - mutex_lock(&bdev->bd_contains->bd_mutex); + mutex_lock_nested(&bdev->bd_contains->bd_mutex, + BD_MUTEX_PARTITION); bdev->bd_contains->bd_part_count++; mutex_unlock(&bdev->bd_contains->bd_mutex); } @@ -985,11 +1007,49 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) fake_file.f_dentry = &fake_dentry; fake_dentry.d_inode = bdev->bd_inode; - return do_open(bdev, &fake_file); + return do_open(bdev, &fake_file, BD_MUTEX_NORMAL); } EXPORT_SYMBOL(blkdev_get); +static int +blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags) +{ + /* + * This crockload is due to bad choice of ->open() type. + * It will go away. + * For now, block device ->open() routine must _not_ + * examine anything in 'inode' argument except ->i_rdev. + */ + struct file fake_file = {}; + struct dentry fake_dentry = {}; + fake_file.f_mode = mode; + fake_file.f_flags = flags; + fake_file.f_dentry = &fake_dentry; + fake_dentry.d_inode = bdev->bd_inode; + + return do_open(bdev, &fake_file, BD_MUTEX_WHOLE); +} + +static int +blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags) +{ + /* + * This crockload is due to bad choice of ->open() type. + * It will go away. + * For now, block device ->open() routine must _not_ + * examine anything in 'inode' argument except ->i_rdev. 
+ */ + struct file fake_file = {}; + struct dentry fake_dentry = {}; + fake_file.f_mode = mode; + fake_file.f_flags = flags; + fake_file.f_dentry = &fake_dentry; + fake_dentry.d_inode = bdev->bd_inode; + + return do_open(bdev, &fake_file, BD_MUTEX_PARTITION); +} + static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; @@ -1005,7 +1065,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) bdev = bd_acquire(inode); - res = do_open(bdev, filp); + res = do_open(bdev, filp, BD_MUTEX_NORMAL); if (res) return res; @@ -1019,13 +1079,13 @@ static int blkdev_open(struct inode * inode, struct file * filp) return res; } -int blkdev_put(struct block_device *bdev) +static int __blkdev_put(struct block_device *bdev, unsigned int subclass) { int ret = 0; struct inode *bd_inode = bdev->bd_inode; struct gendisk *disk = bdev->bd_disk; - mutex_lock(&bdev->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, subclass); lock_kernel(); if (!--bdev->bd_openers) { sync_blockdev(bdev); @@ -1035,7 +1095,8 @@ int blkdev_put(struct block_device *bdev) if (disk->fops->release) ret = disk->fops->release(bd_inode, NULL); } else { - mutex_lock(&bdev->bd_contains->bd_mutex); + mutex_lock_nested(&bdev->bd_contains->bd_mutex, + subclass + 1); bdev->bd_contains->bd_part_count--; mutex_unlock(&bdev->bd_contains->bd_mutex); } @@ -1051,9 +1112,8 @@ int blkdev_put(struct block_device *bdev) } bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) { - blkdev_put(bdev->bd_contains); - } + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, subclass + 1); bdev->bd_contains = NULL; } unlock_kernel(); @@ -1062,8 +1122,20 @@ int blkdev_put(struct block_device *bdev) return ret; } +int blkdev_put(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_NORMAL); +} + EXPORT_SYMBOL(blkdev_put); +int blkdev_put_partition(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_PARTITION); +} + +EXPORT_SYMBOL(blkdev_put_partition); + static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index e26de68059af..134b32068246 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -435,6 +435,21 @@ struct block_device { unsigned long bd_private; }; +/* + * bdev->bd_mutex nesting subclasses for the lock validator: + * + * 0: normal + * 1: 'whole' + * 2: 'partition' + */ +enum bdev_bd_mutex_lock_class +{ + BD_MUTEX_NORMAL, + BD_MUTEX_WHOLE, + BD_MUTEX_PARTITION +}; + + /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache * radix trees @@ -1425,6 +1440,7 @@ extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern struct block_device *open_by_devnum(dev_t, unsigned); +extern struct block_device *open_partition_by_devnum(dev_t, unsigned); extern const struct file_operations def_blk_fops; extern const struct address_space_operations def_blk_aops; extern const struct file_operations def_chr_fops; @@ -1435,6 +1451,7 @@ extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *, mode_t, unsigned); extern int blkdev_put(struct block_device *); +extern int blkdev_put_partition(struct block_device *); extern int 
bd_claim(struct block_device *, void *); extern void bd_release(struct block_device *); #ifdef CONFIG_SYSFS -- cgit v1.2.3 From a5b5bb9a053a973c23b867738c074acb3e80c0a0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:35 -0700 Subject: [PATCH] lockdep: annotate sk_locks Teach sk_lock semantics to the lock validator. In the softirq path the slock has mutex_trylock()+mutex_unlock() semantics; in the process-context lock_sock() case it has mutex_lock()/mutex_unlock() semantics. Thus we treat sock_owned_by_user()-flagged areas as exclusion areas too, not just those areas covered by a held sk_lock.slock. Effect on non-lockdep kernels: minimal, sock_lock_init() has been turned into an inline function. Signed-off-by: Ingo Molnar Cc: Arjan van de Ven Cc: "David S. Miller" Cc: Herbert Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 20 +++++------ net/core/sock.c | 97 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 0969fb60d6ea..324b3ea233d6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include /* struct sk_buff */ #include @@ -78,18 +79,17 @@ typedef struct { spinlock_t slock; struct sock_iocb *owner; wait_queue_head_t wq; + /* + * We express the mutex-alike socket_lock semantics + * to the lock validator by explicitly managing + * the slock as a lock variant (in addition to + * the slock itself): + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } socket_lock_t; -extern struct lock_class_key af_family_keys[AF_MAX]; - -#define sock_lock_init(__sk) \ -do { spin_lock_init(&((__sk)->sk_lock.slock)); \ - lockdep_set_class(&(__sk)->sk_lock.slock, \ - af_family_keys + (__sk)->sk_family); \ - (__sk)->sk_lock.owner = NULL; \ - init_waitqueue_head(&((__sk)->sk_lock.wq)); \ -} while(0) - struct sock; struct proto; diff --git a/net/core/sock.c b/net/core/sock.c index 0b4d5d25b23c..51fcfbc041a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -133,7 +133,42 @@ * Each address family might have different locking rules, so we have * one slock key per address family: */ -struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_slock_keys[AF_MAX]; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * Make lock validator output more readable.
(we pre-construct these + * strings build-time, so that runtime initialization of socket + * locks is fast): + */ +static const char *af_family_key_strings[AF_MAX+1] = { + "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , + "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", + "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , + "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , + "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , + "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , + "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , + "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , + "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , + "sk_lock-27" , "sk_lock-28" , "sk_lock-29" , + "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX" +}; +static const char *af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , + "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", + "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , + "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , + "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , + "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , + "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , + "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , + "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , + "slock-27" , "slock-28" , "slock-29" , + "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_MAX" +}; +#endif /* * sk_callback_lock locking rules are per-address-family, @@ -249,9 +284,16 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb) skb->dev = NULL; bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) { + /* + * trylock + unlock semantics: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); + rc = sk->sk_backlog_rcv(sk, skb); - else + + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); out: @@ -761,6 +803,33 @@ lenout: return 0; } +/* + * Initialize an sk_lock. + * + * (We also register the sk_lock with the lock validator.) 
+ */ +static void inline sock_lock_init(struct sock *sk) +{ + spin_lock_init(&sk->sk_lock.slock); + sk->sk_lock.owner = NULL; + init_waitqueue_head(&sk->sk_lock.wq); + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)&sk->sk_lock, sizeof(sk->sk_lock)); + + /* + * Mark both the sk_lock and the sk_lock.slock as a + * per-address-family lock class: + */ + lockdep_set_class_and_name(&sk->sk_lock.slock, + af_family_slock_keys + sk->sk_family, + af_family_slock_key_strings[sk->sk_family]); + lockdep_init_map(&sk->sk_lock.dep_map, + af_family_key_strings[sk->sk_family], + af_family_keys + sk->sk_family); +} + /** * sk_alloc - All socket objects are allocated here * @family: protocol family @@ -1465,24 +1534,34 @@ void sock_init_data(struct socket *sock, struct sock *sk) void fastcall lock_sock(struct sock *sk) { might_sleep(); - spin_lock_bh(&(sk->sk_lock.slock)); + spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_lock.owner) __lock_sock(sk); sk->sk_lock.owner = (void *)1; - spin_unlock_bh(&(sk->sk_lock.slock)); + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + local_bh_enable(); } EXPORT_SYMBOL(lock_sock); void fastcall release_sock(struct sock *sk) { - spin_lock_bh(&(sk->sk_lock.slock)); + /* + * The sk_lock has mutex_unlock() semantics: + */ + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + + spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); sk->sk_lock.owner = NULL; - if (waitqueue_active(&(sk->sk_lock.wq))) - wake_up(&(sk->sk_lock.wq)); - spin_unlock_bh(&(sk->sk_lock.slock)); + if (waitqueue_active(&sk->sk_lock.wq)) + wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); -- cgit v1.2.3 From 36c8b586896f60cb91a4fd526233190b34316baf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:41 -0700 Subject: [PATCH] sched: cleanup, remove task_t, convert to struct task_struct cleanup: remove task_t and convert all the uses to struct task_struct. I introduced it for the scheduler ages ago and it was a mistake. Conversion was mostly scripted; the result was reviewed, and all secondary whitespace and style impact (if any) was fixed up by hand.
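The shape of the conversion is easiest to see as a condensed before/after sketch; the include/linux/sched.h hunk below contains the real instances of both lines.

/* Before: the scheduler-era alias. */
typedef struct task_struct task_t;
extern void set_user_nice(task_t *p, long nice);

/* After: a plain forward declaration replaces the typedef. */
struct task_struct;
extern void set_user_nice(struct task_struct *p, long nice);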
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 2 +- arch/ia64/kernel/mca.c | 10 +- arch/ia64/kernel/smpboot.c | 2 +- arch/mips/kernel/entry.S | 2 +- arch/mips/kernel/mips-mt.c | 6 +- arch/um/kernel/tt/process_kern.c | 2 +- drivers/char/tty_io.c | 2 +- fs/eventpoll.c | 4 +- include/asm-ia64/thread_info.h | 2 +- include/asm-m32r/system.h | 2 +- include/asm-sh/system.h | 2 +- include/linux/sched.h | 55 +++++------ kernel/capability.c | 8 +- kernel/exit.c | 35 +++---- kernel/fork.c | 18 ++-- kernel/hrtimer.c | 2 +- kernel/pid.c | 6 +- kernel/ptrace.c | 6 +- kernel/rtmutex-debug.c | 5 +- kernel/rtmutex-tester.c | 4 +- kernel/rtmutex.c | 11 ++- kernel/sched.c | 192 ++++++++++++++++++++------------------- kernel/timer.c | 2 +- kernel/workqueue.c | 2 +- mm/oom_kill.c | 8 +- 25 files changed, 203 insertions(+), 187 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 01c8c8b23337..41ebf51a107a 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -474,7 +474,7 @@ out: */ unsigned long -thread_saved_pc(task_t *t) +thread_saved_pc(struct task_struct *t) { unsigned long base = (unsigned long)task_stack_page(t); unsigned long fp, sp = task_thread_info(t)->pcb.ksp; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index eb8e8dc5ac8e..2fbe4536fe18 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -678,7 +678,7 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) */ static void -ia64_mca_modify_comm(const task_t *previous_current) +ia64_mca_modify_comm(const struct task_struct *previous_current) { char *p, comm[sizeof(current->comm)]; if (previous_current->pid) @@ -709,7 +709,7 @@ ia64_mca_modify_comm(const task_t *previous_current) * that we can do backtrace on the MCA/INIT handler code itself. 
*/ -static task_t * +static struct task_struct * ia64_mca_modify_original_stack(struct pt_regs *regs, const struct switch_stack *sw, struct ia64_sal_os_state *sos, @@ -719,7 +719,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, ia64_va va; extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ const pal_min_state_area_t *ms = sos->pal_min_state; - task_t *previous_current; + struct task_struct *previous_current; struct pt_regs *old_regs; struct switch_stack *old_sw; unsigned size = sizeof(struct pt_regs) + @@ -1023,7 +1023,7 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, pal_processor_state_info_t *psp = (pal_processor_state_info_t *) &sos->proc_state_param; int recover, cpu = smp_processor_id(); - task_t *previous_current; + struct task_struct *previous_current; struct ia64_mca_notify_die nd = { .sos = sos, .monarch_cpu = &monarch_cpu }; @@ -1352,7 +1352,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, { static atomic_t slaves; static atomic_t monarchs; - task_t *previous_current; + struct task_struct *previous_current; int cpu = smp_processor_id(); struct ia64_mca_notify_die nd = { .sos = sos, .monarch_cpu = &monarch_cpu }; diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index e1960979be29..6203ed4ec8cf 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -124,7 +124,7 @@ extern void __devinit calibrate_delay (void); extern void start_ap (void); extern unsigned long ia64_iobase; -task_t *task_for_booting_cpu; +struct task_struct *task_for_booting_cpu; /* * State for each CPU diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S index ecfd637d702a..01e7fa86aa43 100644 --- a/arch/mips/kernel/entry.S +++ b/arch/mips/kernel/entry.S @@ -65,7 +65,7 @@ need_resched: #endif FEXPORT(ret_from_fork) - jal schedule_tail # a0 = task_t *prev + jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) local_irq_disable # make sure need_resched and diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index 02237a685ec7..4dcc39f42951 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -47,7 +47,7 @@ unsigned long mt_fpemul_threshold = 0; * used in sys_sched_set/getaffinity() in kernel/sched.c, so * cloned here. */ -static inline task_t *find_process_by_pid(pid_t pid) +static inline struct task_struct *find_process_by_pid(pid_t pid) { return pid ? 
find_task_by_pid(pid) : current; } @@ -62,7 +62,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, cpumask_t new_mask; cpumask_t effective_mask; int retval; - task_t *p; + struct task_struct *p; if (len < sizeof(new_mask)) return -EINVAL; @@ -127,7 +127,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned int real_len; cpumask_t mask; int retval; - task_t *p; + struct task_struct *p; real_len = sizeof(mask); if (len < real_len) diff --git a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c index a9c1443fc548..8368c2dbe635 100644 --- a/arch/um/kernel/tt/process_kern.c +++ b/arch/um/kernel/tt/process_kern.c @@ -119,7 +119,7 @@ void suspend_new_thread(int fd) panic("read failed in suspend_new_thread, err = %d", -err); } -void schedule_tail(task_t *prev); +void schedule_tail(struct task_struct *prev); static void new_thread_handler(int sig) { diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 6fb77952562d..bfdb90242a90 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2336,7 +2336,7 @@ static int fionbio(struct file *file, int __user *p) static int tiocsctty(struct tty_struct *tty, int arg) { - task_t *p; + struct task_struct *p; if (current->signal->leader && (current->signal->session == tty->session)) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9c677bbd0b08..19ffb043abbc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -120,7 +120,7 @@ struct epoll_filefd { */ struct wake_task_node { struct list_head llink; - task_t *task; + struct task_struct *task; wait_queue_head_t *wq; }; @@ -413,7 +413,7 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) { int wake_nests = 0; unsigned long flags; - task_t *this_task = current; + struct task_struct *this_task = current; struct list_head *lsthead = &psw->wake_task_list, *lnk; struct wake_task_node *tncur; struct wake_task_node tnode; diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 8bc9869e5765..8adcde0934ca 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -68,7 +68,7 @@ struct thread_info { #define end_of_stack(p) (unsigned long *)((void *)(p) + IA64_RBS_OFFSET) #define __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -#define alloc_task_struct() ((task_t *)__get_free_pages(GFP_KERNEL | __GFP_COMP, KERNEL_STACK_SIZE_ORDER)) +#define alloc_task_struct() ((struct task_struct *)__get_free_pages(GFP_KERNEL | __GFP_COMP, KERNEL_STACK_SIZE_ORDER)) #define free_task_struct(tsk) free_pages((unsigned long) (tsk), KERNEL_STACK_SIZE_ORDER) #endif /* !__ASSEMBLY */ diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h index 66c4742f09e7..311cebf44eff 100644 --- a/include/asm-m32r/system.h +++ b/include/asm-m32r/system.h @@ -18,7 +18,7 @@ * switch_to(prev, next) should switch from task `prev' to `next' * `prev' will never be the same as `next'. 
* - * `next' and `prev' should be task_t, but it isn't always defined + * `next' and `prev' should be struct task_struct, but it isn't always defined */ #define switch_to(prev, next, last) do { \ diff --git a/include/asm-sh/system.h b/include/asm-sh/system.h index b752e5cbb830..ce2e60664a86 100644 --- a/include/asm-sh/system.h +++ b/include/asm-sh/system.h @@ -12,7 +12,7 @@ */ #define switch_to(prev, next, last) do { \ - task_t *__last; \ + struct task_struct *__last; \ register unsigned long *__ts1 __asm__ ("r1") = &prev->thread.sp; \ register unsigned long *__ts2 __asm__ ("r2") = &prev->thread.pc; \ register unsigned long *__ts4 __asm__ ("r4") = (unsigned long *)prev; \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ebddba4448d..c2797f04d931 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -184,11 +184,11 @@ extern unsigned long weighted_cpuload(const int cpu); extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; -typedef struct task_struct task_t; +struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); -extern void init_idle(task_t *idle, int cpu); +extern void init_idle(struct task_struct *idle, int cpu); extern cpumask_t nohz_cpu_mask; @@ -383,7 +383,7 @@ struct signal_struct { wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ - task_t *curr_target; + struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; @@ -699,7 +699,7 @@ extern int groups_search(struct group_info *group_info, gid_t grp); ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK -extern void prefetch_stack(struct task_struct*); +extern void prefetch_stack(struct task_struct *t); #else static inline void prefetch_stack(struct task_struct *t) { } #endif @@ -1031,9 +1031,9 @@ static inline void put_task_struct(struct task_struct *t) #define used_math() tsk_used_math(current) #ifdef CONFIG_SMP -extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); +extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); #else -static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) +static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { if (!cpu_isset(0, new_mask)) return -EINVAL; @@ -1042,7 +1042,8 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) #endif extern unsigned long long sched_clock(void); -extern unsigned long long current_sched_time(const task_t *current_task); +extern unsigned long long +current_sched_time(const struct task_struct *current_task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -1060,27 +1061,27 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); #ifdef CONFIG_RT_MUTEXES -extern int rt_mutex_getprio(task_t *p); -extern void rt_mutex_setprio(task_t *p, int prio); -extern void rt_mutex_adjust_pi(task_t *p); +extern int rt_mutex_getprio(struct task_struct *p); +extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern void rt_mutex_adjust_pi(struct task_struct *p); #else -static inline int rt_mutex_getprio(task_t *p) +static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) #endif -extern void set_user_nice(task_t *p, long nice); -extern int task_prio(const task_t *p); -extern int task_nice(const task_t *p); -extern int can_nice(const task_t *p, const int nice); -extern int 
task_curr(const task_t *p); +extern void set_user_nice(struct task_struct *p, long nice); +extern int task_prio(const struct task_struct *p); +extern int task_nice(const struct task_struct *p); +extern int can_nice(const struct task_struct *p, const int nice); +extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); -extern task_t *idle_task(int cpu); -extern task_t *curr_task(int cpu); -extern void set_curr_task(int cpu, task_t *p); +extern struct task_struct *idle_task(int cpu); +extern struct task_struct *curr_task(int cpu); +extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); @@ -1137,8 +1138,8 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void FASTCALL(sched_fork(task_t * p, int clone_flags)); -extern void FASTCALL(sched_exit(task_t * p)); +extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); +extern void FASTCALL(sched_exit(struct task_struct * p)); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); @@ -1243,17 +1244,17 @@ extern NORET_TYPE void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); extern int disallow_signal(int); -extern task_t *child_reaper; +extern struct task_struct *child_reaper; extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); -task_t *fork_idle(int); +struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); extern void get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -extern void wait_task_inactive(task_t * p); +extern void wait_task_inactive(struct task_struct * p); #else #define wait_task_inactive(p) do { } while (0) #endif @@ -1279,13 +1280,13 @@ extern void wait_task_inactive(task_t * p); /* de_thread depends on thread_group_leader not being a pid based check */ #define thread_group_leader(p) (p == p->group_leader) -static inline task_t *next_thread(const task_t *p) +static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry(rcu_dereference(p->thread_group.next), - task_t, thread_group); + struct task_struct, thread_group); } -static inline int thread_group_empty(task_t *p) +static inline int thread_group_empty(struct task_struct *p) { return list_empty(&p->thread_group); } diff --git a/kernel/capability.c b/kernel/capability.c index 1a4d8a40d3f9..c7685ad00a97 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) int ret = 0; pid_t pid; __u32 version; - task_t *target; + struct task_struct *target; struct __user_cap_data_struct data; if (get_user(version, &header->version)) @@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - task_t *g, *target; + struct task_struct *g, *target; int ret = -EPERM; int found = 0; @@ -128,7 +128,7 @@ static inline int cap_set_all(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - task_t *g, *target; + struct task_struct *g, *target; int ret = -EPERM; int found = 0; @@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) 
{ kernel_cap_t inheritable, permitted, effective; __u32 version; - task_t *target; + struct task_struct *target; int ret; pid_t pid; diff --git a/kernel/exit.c b/kernel/exit.c index c595db14cf25..6664c084783d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -134,8 +134,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) void release_task(struct task_struct * p) { + struct task_struct *leader; int zap_leader; - task_t *leader; repeat: atomic_dec(&p->user->processes); write_lock_irq(&tasklist_lock); @@ -209,7 +209,7 @@ out: * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) +static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) { struct task_struct *p; int ret = 1; @@ -582,7 +582,8 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static inline void choose_new_parent(task_t *p, task_t *reaper) +static inline void +choose_new_parent(struct task_struct *p, struct task_struct *reaper) { /* * Make sure we're not reparenting to ourselves and that @@ -592,7 +593,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper) p->real_parent = reaper; } -static void reparent_thread(task_t *p, task_t *father, int traced) +static void +reparent_thread(struct task_struct *p, struct task_struct *father, int traced) { /* We don't want people slaying init. */ if (p->exit_signal != -1) @@ -656,8 +658,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced) * group, and if no such member exists, give it to * the global child reaper process (ie "init") */ -static void forget_original_parent(struct task_struct * father, - struct list_head *to_release) +static void +forget_original_parent(struct task_struct *father, struct list_head *to_release) { struct task_struct *p, *reaper = father; struct list_head *_p, *_n; @@ -680,7 +682,7 @@ static void forget_original_parent(struct task_struct * father, */ list_for_each_safe(_p, _n, &father->children) { int ptrace; - p = list_entry(_p,struct task_struct,sibling); + p = list_entry(_p, struct task_struct, sibling); ptrace = p->ptrace; @@ -709,7 +711,7 @@ static void forget_original_parent(struct task_struct * father, list_add(&p->ptrace_list, to_release); } list_for_each_safe(_p, _n, &father->ptrace_children) { - p = list_entry(_p,struct task_struct,ptrace_list); + p = list_entry(_p, struct task_struct, ptrace_list); choose_new_parent(p, reaper); reparent_thread(p, father, 1); } @@ -829,7 +831,7 @@ static void exit_notify(struct task_struct *tsk) list_for_each_safe(_p, _n, &ptrace_dead) { list_del_init(_p); - t = list_entry(_p,struct task_struct,ptrace_list); + t = list_entry(_p, struct task_struct, ptrace_list); release_task(t); } @@ -1010,7 +1012,7 @@ asmlinkage void sys_exit_group(int error_code) do_group_exit((error_code & 0xff) << 8); } -static int eligible_child(pid_t pid, int options, task_t *p) +static int eligible_child(pid_t pid, int options, struct task_struct *p) { if (pid > 0) { if (p->pid != pid) @@ -1051,12 +1053,13 @@ static int eligible_child(pid_t pid, int options, task_t *p) return 1; } -static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, +static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, int why, int status, struct siginfo __user *infop, struct rusage __user *rusagep) { int retval = rusagep ? 
getrusage(p, RUSAGE_BOTH, rusagep) : 0; + put_task_struct(p); if (!retval) retval = put_user(SIGCHLD, &infop->si_signo); @@ -1081,7 +1084,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(task_t *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1243,8 +1246,8 @@ static int wait_task_zombie(task_t *p, int noreap, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, - struct siginfo __user *infop, +static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, + int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code; @@ -1358,7 +1361,7 @@ bail_ref: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(task_t *p, int noreap, +static int wait_task_continued(struct task_struct *p, int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1444,7 +1447,7 @@ repeat: int ret; list_for_each(_p,&tsk->children) { - p = list_entry(_p,struct task_struct,sibling); + p = list_entry(_p, struct task_struct, sibling); ret = eligible_child(pid, options, p); if (!ret) diff --git a/kernel/fork.c b/kernel/fork.c index 54953d8a6f17..56e4e07e45f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -933,13 +933,13 @@ static inline void rt_mutex_init_task(struct task_struct *p) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. 
*/ -static task_t *copy_process(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - int pid) +static struct task_struct *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + int pid) { int retval; struct task_struct *p = NULL; @@ -1294,9 +1294,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) return regs; } -task_t * __devinit fork_idle(int cpu) +struct task_struct * __devinit fork_idle(int cpu) { - task_t *task; + struct task_struct *task; struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 617304ce67db..d17766d40dab 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -669,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; diff --git a/kernel/pid.c b/kernel/pid.c index eeb836b65ca4..93e212f20671 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -218,7 +218,7 @@ struct pid * fastcall find_pid(int nr) return NULL; } -int fastcall attach_pid(task_t *task, enum pid_type type, int nr) +int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { struct pid_link *link; struct pid *pid; @@ -233,7 +233,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) return 0; } -void fastcall detach_pid(task_t *task, enum pid_type type) +void fastcall detach_pid(struct task_struct *task, enum pid_type type) { struct pid_link *link; struct pid *pid; @@ -267,7 +267,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ -task_t *find_task_by_pid_type(int type, int nr) +struct task_struct *find_task_by_pid_type(int type, int nr) { return pid_task(find_pid(nr), type); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 335c5b932e14..9a111f70145c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,7 +28,7 @@ * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(task_t *child, task_t *new_parent) +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) { BUG_ON(!list_empty(&child->ptrace_list)); if (child->parent == new_parent) @@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent) * TASK_TRACED, resume it now. * Requires that irqs be disabled. */ -void ptrace_untrace(task_t *child) +void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (child->state == TASK_TRACED) { @@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child) * * Must be called with the tasklist lock write-held. 
*/ -void __ptrace_unlink(task_t *child) +void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 353a853bc390..0c1faa950af7 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -96,7 +96,7 @@ void deadlock_trace_off(void) rt_trace_on = 0; } -static void printk_task(task_t *p) +static void printk_task(struct task_struct *p) { if (p) printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); @@ -231,7 +231,8 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) lock->name = name; } -void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) +void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { } diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e82c2f848249..494dac872a13 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -33,7 +33,7 @@ struct test_thread_data { }; static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static task_t *threads[MAX_RT_TEST_THREADS]; +static struct task_struct *threads[MAX_RT_TEST_THREADS]; static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; enum test_opcodes { @@ -361,8 +361,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) { struct test_thread_data *td; + struct task_struct *tsk; char *curr = buf; - task_t *tsk; int i; td = container_of(dev, struct test_thread_data, sysdev); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 91b699aa658b..d2ef13b485e7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -157,7 +157,7 @@ int max_lock_depth = 1024; * Decreases task's usage by one - may thus free the task. * Returns 0 or -EDEADLK. 
*/ -static int rt_mutex_adjust_prio_chain(task_t *task, +static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, struct rt_mutex_waiter *orig_waiter, @@ -282,6 +282,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task, spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); + return ret; } @@ -403,10 +404,10 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int detect_deadlock) { + struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - task_t *owner = rt_mutex_owner(lock); - int boost = 0, res; unsigned long flags; + int boost = 0, res; spin_lock_irqsave(¤t->pi_lock, flags); __rt_mutex_adjust_prio(current); @@ -527,9 +528,9 @@ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { int first = (waiter == rt_mutex_top_waiter(lock)); - int boost = 0; - task_t *owner = rt_mutex_owner(lock); + struct task_struct *owner = rt_mutex_owner(lock); unsigned long flags; + int boost = 0; spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); diff --git a/kernel/sched.c b/kernel/sched.c index b0326141f841..021b31219516 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -179,7 +179,7 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } -static inline unsigned int task_timeslice(task_t *p) +static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); } @@ -227,7 +227,7 @@ struct runqueue { unsigned long expired_timestamp; unsigned long long timestamp_last_tick; - task_t *curr, *idle; + struct task_struct *curr, *idle; struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int best_expired_prio; @@ -240,7 +240,7 @@ struct runqueue { int active_balance; int push_cpu; - task_t *migration_thread; + struct task_struct *migration_thread; struct list_head migration_queue; #endif @@ -291,16 +291,16 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline int task_running(runqueue_t *rq, task_t *p) +static inline int task_running(runqueue_t *rq, struct task_struct *p) { return rq->curr == p; } -static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +static inline void prepare_lock_switch(runqueue_t *rq, struct task_struct *next) { } -static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +static inline void finish_lock_switch(runqueue_t *rq, struct task_struct *prev) { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ @@ -317,7 +317,7 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(runqueue_t *rq, task_t *p) +static inline int task_running(runqueue_t *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->oncpu; @@ -326,7 +326,7 @@ static inline int task_running(runqueue_t *rq, task_t *p) #endif } -static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +static inline void prepare_lock_switch(runqueue_t *rq, struct task_struct *next) { #ifdef CONFIG_SMP /* @@ -343,7 +343,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) #endif } -static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +static inline void finish_lock_switch(runqueue_t *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* @@ 
-364,7 +364,7 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. */ -static inline runqueue_t *__task_rq_lock(task_t *p) +static inline runqueue_t *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { struct runqueue *rq; @@ -384,7 +384,7 @@ repeat_lock_task: * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(struct task_struct *p, unsigned long *flags) __acquires(rq->lock) { struct runqueue *rq; @@ -541,7 +541,7 @@ static inline runqueue_t *this_rq_lock(void) * long it was from the *first* time it was queued to the time that it * finally hit a cpu. */ -static inline void sched_info_dequeued(task_t *t) +static inline void sched_info_dequeued(struct task_struct *t) { t->sched_info.last_queued = 0; } @@ -551,7 +551,7 @@ static inline void sched_info_dequeued(task_t *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ -static void sched_info_arrive(task_t *t) +static void sched_info_arrive(struct task_struct *t) { unsigned long now = jiffies, diff = 0; struct runqueue *rq = task_rq(t); @@ -585,7 +585,7 @@ static void sched_info_arrive(task_t *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(task_t *t) +static inline void sched_info_queued(struct task_struct *t) { if (!t->sched_info.last_queued) t->sched_info.last_queued = jiffies; @@ -595,7 +595,7 @@ static inline void sched_info_queued(task_t *t) * Called when a process ceases being the active-running process, either * voluntarily or involuntarily. Now we can calculate how long we ran. */ -static inline void sched_info_depart(task_t *t) +static inline void sched_info_depart(struct task_struct *t) { struct runqueue *rq = task_rq(t); unsigned long diff = jiffies - t->sched_info.last_arrival; @@ -611,7 +611,8 @@ static inline void sched_info_depart(task_t *t) * their time slice. (This may also be called when switching to or from * the idle task.) We are only called when prev != next. */ -static inline void sched_info_switch(task_t *prev, task_t *next) +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) { struct runqueue *rq = task_rq(prev); @@ -683,7 +684,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) * Both properties are important to certain workloads. 
*/ -static inline int __normal_prio(task_t *p) +static inline int __normal_prio(struct task_struct *p) { int bonus, prio; @@ -719,7 +720,7 @@ static inline int __normal_prio(task_t *p) #define RTPRIO_TO_LOAD_WEIGHT(rp) \ (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -static void set_load_weight(task_t *p) +static void set_load_weight(struct task_struct *p) { if (has_rt_policy(p)) { #ifdef CONFIG_SMP @@ -737,23 +738,25 @@ static void set_load_weight(task_t *p) p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); } -static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) +static inline void +inc_raw_weighted_load(runqueue_t *rq, const struct task_struct *p) { rq->raw_weighted_load += p->load_weight; } -static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) +static inline void +dec_raw_weighted_load(runqueue_t *rq, const struct task_struct *p) { rq->raw_weighted_load -= p->load_weight; } -static inline void inc_nr_running(task_t *p, runqueue_t *rq) +static inline void inc_nr_running(struct task_struct *p, runqueue_t *rq) { rq->nr_running++; inc_raw_weighted_load(rq, p); } -static inline void dec_nr_running(task_t *p, runqueue_t *rq) +static inline void dec_nr_running(struct task_struct *p, runqueue_t *rq) { rq->nr_running--; dec_raw_weighted_load(rq, p); @@ -766,7 +769,7 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq) * setprio syscalls, and whenever the interactivity * estimator recalculates. */ -static inline int normal_prio(task_t *p) +static inline int normal_prio(struct task_struct *p) { int prio; @@ -784,7 +787,7 @@ static inline int normal_prio(task_t *p) * interactivity modifiers. Will be RT if the task got * RT-boosted. If not then it returns p->normal_prio. */ -static int effective_prio(task_t *p) +static int effective_prio(struct task_struct *p) { p->normal_prio = normal_prio(p); /* @@ -800,7 +803,7 @@ static int effective_prio(task_t *p) /* * __activate_task - move a task to the runqueue. */ -static void __activate_task(task_t *p, runqueue_t *rq) +static void __activate_task(struct task_struct *p, runqueue_t *rq) { prio_array_t *target = rq->active; @@ -813,7 +816,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) /* * __activate_idle_task - move idle task to the _front_ of runqueue. */ -static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +static inline void __activate_idle_task(struct task_struct *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); inc_nr_running(p, rq); @@ -823,7 +826,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) * Recalculate p->normal_prio and p->prio after having slept, * updating the sleep-average too: */ -static int recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(struct task_struct *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long sleep_time = now - p->timestamp; @@ -895,7 +898,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) 
*/ -static void activate_task(task_t *p, runqueue_t *rq, int local) +static void activate_task(struct task_struct *p, runqueue_t *rq, int local) { unsigned long long now; @@ -962,7 +965,7 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(task_t *p) +static void resched_task(struct task_struct *p) { int cpu; @@ -983,7 +986,7 @@ static void resched_task(task_t *p) smp_send_reschedule(cpu); } #else -static inline void resched_task(task_t *p) +static inline void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); @@ -994,7 +997,7 @@ static inline void resched_task(task_t *p) * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ -inline int task_curr(const task_t *p) +inline int task_curr(const struct task_struct *p) { return cpu_curr(task_cpu(p)) == p; } @@ -1009,7 +1012,7 @@ unsigned long weighted_cpuload(const int cpu) typedef struct { struct list_head list; - task_t *task; + struct task_struct *task; int dest_cpu; struct completion done; @@ -1019,7 +1022,8 @@ typedef struct { * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) +static int +migrate_task(struct task_struct *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); @@ -1049,7 +1053,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t *p) +void wait_task_inactive(struct task_struct *p) { unsigned long flags; runqueue_t *rq; @@ -1083,7 +1087,7 @@ repeat: * to another CPU then no harm is done and the purpose has been * achieved as well. */ -void kick_process(task_t *p) +void kick_process(struct task_struct *p) { int cpu; @@ -1286,7 +1290,7 @@ nextlevel: * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, task_t *p) +static int wake_idle(int cpu, struct task_struct *p) { cpumask_t tmp; struct sched_domain *sd; @@ -1309,7 +1313,7 @@ static int wake_idle(int cpu, task_t *p) return cpu; } #else -static inline int wake_idle(int cpu, task_t *p) +static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; } @@ -1329,7 +1333,7 @@ static inline int wake_idle(int cpu, task_t *p) * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1487,14 +1491,14 @@ out: return success; } -int fastcall wake_up_process(task_t *p) +int fastcall wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); -int fastcall wake_up_state(task_t *p, unsigned int state) +int fastcall wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); } @@ -1503,7 +1507,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
*/ -void fastcall sched_fork(task_t *p, int clone_flags) +void fastcall sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); @@ -1571,7 +1575,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) +void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1655,7 +1659,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t *p) +void fastcall sched_exit(struct task_struct *p) { unsigned long flags; runqueue_t *rq; @@ -1689,7 +1693,7 @@ void fastcall sched_exit(task_t *p) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +static inline void prepare_task_switch(runqueue_t *rq, struct task_struct *next) { prepare_lock_switch(rq, next); prepare_arch_switch(next); @@ -1710,7 +1714,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(runqueue_t *rq, task_t *prev) +static inline void finish_task_switch(runqueue_t *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -1748,7 +1752,7 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(task_t *prev) +asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { runqueue_t *rq = this_rq(); @@ -1765,8 +1769,9 @@ asmlinkage void schedule_tail(task_t *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline -task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +static inline struct task_struct * +context_switch(runqueue_t *rq, struct task_struct *prev, + struct task_struct *next) { struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; @@ -1937,7 +1942,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) * allow dest_cpu, which will force the cpu onto dest_cpu. Then * the cpu_allowed mask is restored. */ -static void sched_migrate_task(task_t *p, int dest_cpu) +static void sched_migrate_task(struct task_struct *p, int dest_cpu) { migration_req_t req; runqueue_t *rq; @@ -1952,11 +1957,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu) if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ struct task_struct *mt = rq->migration_thread; + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); + return; } out: @@ -1980,9 +1987,9 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ -static -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +static void pull_task(runqueue_t *src_rq, prio_array_t *src_array, + struct task_struct *p, runqueue_t *this_rq, + prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); dec_nr_running(p, src_rq); @@ -2003,7 +2010,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, +int can_migrate_task(struct task_struct *p, runqueue_t *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { @@ -2052,8 +2059,8 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, best_prio_seen, skip_for_load; prio_array_t *array, *dst_array; struct list_head *head, *curr; + struct task_struct *tmp; long rem_load_move; - task_t *tmp; if (max_nr_move == 0 || max_load_move == 0) goto out; @@ -2105,7 +2112,7 @@ skip_bitmap: head = array->queue + idx; curr = head->prev; skip_queue: - tmp = list_entry(curr, task_t, run_list); + tmp = list_entry(curr, struct task_struct, run_list); curr = curr->prev; @@ -2819,7 +2826,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); * Bank in p->sched_time the ns elapsed since the last tick or switch. */ static inline void -update_cpu_clock(task_t *p, runqueue_t *rq, unsigned long long now) +update_cpu_clock(struct task_struct *p, runqueue_t *rq, unsigned long long now) { p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); } @@ -2828,7 +2835,7 @@ update_cpu_clock(task_t *p, runqueue_t *rq, unsigned long long now) * Return current->sched_time plus any more ns on the sched_clock * that have not yet been banked. */ -unsigned long long current_sched_time(const task_t *p) +unsigned long long current_sched_time(const struct task_struct *p) { unsigned long long ns; unsigned long flags; @@ -2945,9 +2952,9 @@ void account_steal_time(struct task_struct *p, cputime_t steal) void scheduler_tick(void) { unsigned long long now = sched_clock(); + struct task_struct *p = current; int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); - task_t *p = current; update_cpu_clock(p, rq, now); @@ -3079,7 +3086,8 @@ static void wake_sleeping_dependent(int this_cpu) * utilize, if another task runs on a sibling. This models the * slowdown effect of other tasks running on siblings: */ -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +static inline unsigned long +smt_slice(struct task_struct *p, struct sched_domain *sd) { return p->time_slice * (100 - sd->per_cpu_gain) / 100; } @@ -3090,7 +3098,8 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) * acquire their lock. As we only trylock the normal locking order does not * need to be obeyed. 
*/ -static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) +static int +dependent_sleeper(int this_cpu, runqueue_t *this_rq, struct task_struct *p) { struct sched_domain *tmp, *sd = NULL; int ret = 0, i; @@ -3110,8 +3119,8 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) return 0; for_each_cpu_mask(i, sd->span) { + struct task_struct *smt_curr; runqueue_t *smt_rq; - task_t *smt_curr; if (i == this_cpu) continue; @@ -3157,7 +3166,7 @@ static inline void wake_sleeping_dependent(int this_cpu) { } static inline int -dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) +dependent_sleeper(int this_cpu, runqueue_t *this_rq, struct task_struct *p) { return 0; } @@ -3211,11 +3220,11 @@ static inline int interactive_sleep(enum sleep_type sleep_type) */ asmlinkage void __sched schedule(void) { + struct task_struct *prev, *next; struct list_head *queue; unsigned long long now; unsigned long run_time; int cpu, idx, new_prio; - task_t *prev, *next; prio_array_t *array; long *switch_count; runqueue_t *rq; @@ -3308,7 +3317,7 @@ need_resched_nonpreemptible: idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); + next = list_entry(queue->next, struct task_struct, run_list); if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; @@ -3776,7 +3785,7 @@ EXPORT_SYMBOL(sleep_on_timeout); * * Used by the rt_mutex code to implement priority inheritance logic. */ -void rt_mutex_setprio(task_t *p, int prio) +void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; prio_array_t *array; @@ -3817,7 +3826,7 @@ void rt_mutex_setprio(task_t *p, int prio) #endif -void set_user_nice(task_t *p, long nice) +void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta; unsigned long flags; @@ -3873,7 +3882,7 @@ EXPORT_SYMBOL(set_user_nice); * @p: task * @nice: nice value */ -int can_nice(const task_t *p, const int nice) +int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ int nice_rlim = 20 - nice; @@ -3932,7 +3941,7 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(const task_t *p) +int task_prio(const struct task_struct *p) { return p->prio - MAX_RT_PRIO; } @@ -3941,7 +3950,7 @@ int task_prio(const task_t *p) * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(const task_t *p) +int task_nice(const struct task_struct *p) { return TASK_NICE(p); } @@ -3960,7 +3969,7 @@ int idle_cpu(int cpu) * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. */ -task_t *idle_task(int cpu) +struct task_struct *idle_task(int cpu) { return cpu_rq(cpu)->idle; } @@ -3969,7 +3978,7 @@ task_t *idle_task(int cpu) * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ -static inline task_t *find_process_by_pid(pid_t pid) +static inline struct task_struct *find_process_by_pid(pid_t pid) { return pid ? 
find_task_by_pid(pid) : current; } @@ -4103,9 +4112,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { - int retval; struct sched_param lparam; struct task_struct *p; + int retval; if (!param || pid < 0) return -EINVAL; @@ -4121,6 +4130,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); retval = sched_setscheduler(p, policy, &lparam); put_task_struct(p); + return retval; } @@ -4156,8 +4166,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) */ asmlinkage long sys_sched_getscheduler(pid_t pid) { + struct task_struct *p; int retval = -EINVAL; - task_t *p; if (pid < 0) goto out_nounlock; @@ -4184,8 +4194,8 @@ out_nounlock: asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) { struct sched_param lp; + struct task_struct *p; int retval = -EINVAL; - task_t *p; if (!param || pid < 0) goto out_nounlock; @@ -4218,9 +4228,9 @@ out_unlock: long sched_setaffinity(pid_t pid, cpumask_t new_mask) { - task_t *p; - int retval; cpumask_t cpus_allowed; + struct task_struct *p; + int retval; lock_cpu_hotplug(); read_lock(&tasklist_lock); @@ -4306,8 +4316,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; long sched_getaffinity(pid_t pid, cpumask_t *mask) { + struct task_struct *p; int retval; - task_t *p; lock_cpu_hotplug(); read_lock(&tasklist_lock); @@ -4592,9 +4602,9 @@ asmlinkage long sys_sched_get_priority_min(int policy) asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { + struct task_struct *p; int retval = -EINVAL; struct timespec t; - task_t *p; if (pid < 0) goto out_nounlock; @@ -4641,12 +4651,13 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t *p) +static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + +static void show_task(struct task_struct *p) { - task_t *relative; - unsigned state; + struct task_struct *relative; unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + unsigned state; printk("%-13.13s ", p->comm); state = p->state ? __ffs(p->state) + 1 : 0; @@ -4697,7 +4708,7 @@ static void show_task(task_t *p) void show_state(void) { - task_t *g, *p; + struct task_struct *g, *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -4730,7 +4741,7 @@ void show_state(void) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __devinit init_idle(task_t *idle, int cpu) +void __devinit init_idle(struct task_struct *idle, int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long flags; @@ -4793,7 +4804,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. 
*/ -int set_cpus_allowed(task_t *p, cpumask_t new_mask) +int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { unsigned long flags; migration_req_t req; @@ -5061,7 +5072,7 @@ void idle_task_exit(void) mmdrop(mm); } -static void migrate_dead(unsigned int dead_cpu, task_t *p) +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { struct runqueue *rq = cpu_rq(dead_cpu); @@ -5096,9 +5107,8 @@ static void migrate_dead_tasks(unsigned int dead_cpu) struct list_head *list = &rq->arrays[arr].queue[i]; while (!list_empty(list)) - migrate_dead(dead_cpu, - list_entry(list->next, task_t, - run_list)); + migrate_dead(dead_cpu, list_entry(list->next, + struct task_struct, run_list)); } } } @@ -6801,7 +6811,7 @@ void normalize_rt_tasks(void) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -task_t *curr_task(int cpu) +struct task_struct *curr_task(int cpu) { return cpu_curr(cpu); } @@ -6821,7 +6831,7 @@ task_t *curr_task(int cpu) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -void set_curr_task(int cpu, task_t *p) +void set_curr_task(int cpu, struct task_struct *p) { cpu_curr(cpu) = p; } diff --git a/kernel/timer.c b/kernel/timer.c index b761898d04c8..396a3c024c2c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1368,7 +1368,7 @@ asmlinkage long sys_getegid(void) static void process_timeout(unsigned long __data) { - wake_up_process((task_t *)__data); + wake_up_process((struct task_struct *)__data); } /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 59f0b42bd89e..90d2c6001659 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -51,7 +51,7 @@ struct cpu_workqueue_struct { wait_queue_head_t work_done; struct workqueue_struct *wq; - task_t *thread; + struct task_struct *thread; int run_depth; /* Detect run_workqueue() recursion depth */ } ____cacheline_aligned; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d46ed0f1dc06..b9af136e5cfa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -225,7 +225,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that * we select a process with CAP_SYS_RAW_IO set). */ -static void __oom_kill_task(task_t *p, const char *message) +static void __oom_kill_task(struct task_struct *p, const char *message) { if (p->pid == 1) { WARN_ON(1); @@ -255,10 +255,10 @@ static void __oom_kill_task(task_t *p, const char *message) force_sig(SIGKILL, p); } -static int oom_kill_task(task_t *p, const char *message) +static int oom_kill_task(struct task_struct *p, const char *message) { struct mm_struct *mm; - task_t * g, * q; + struct task_struct *g, *q; mm = p->mm; @@ -316,7 +316,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - task_t *p; + struct task_struct *p; unsigned long points = 0; if (printk_ratelimit()) { -- cgit v1.2.3 From 70b97a7f0b19cf1f2619deb5cc41e8b78c591aa7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:42 -0700 Subject: [PATCH] sched: cleanup, convert sched.c-internal typedefs to struct convert: - runqueue_t to 'struct rq' - prio_array_t to 'struct prio_array' - migration_req_t to 'struct migration_req' I was the one who added these but they are all against the kernel coding style and also were used inconsistently in places. So just get rid of them at once, now that we are flushing the scheduler patch-queue anyway. 
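In essence the change is mechanical; a condensed sketch of the pattern (reconstructed from the diff below, not quoted from it):

    /* Before: sched.c-internal typedefs hid the struct tags. */
    typedef struct runqueue runqueue_t;
    typedef struct prio_array prio_array_t;

    static runqueue_t *this_rq_lock(void);

    /* After: plain struct tags, forward-declared where needed. */
    struct rq;
    struct prio_array;

    static struct rq *this_rq_lock(void);

Callers need no changes beyond spelling out the struct keyword.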
Conversion was mostly scripted, the result was reviewed and all secondary whitespace and style impact (if any) was fixed up by hand. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +- kernel/sched.c | 250 +++++++++++++++++++++++++------------------------- 2 files changed, 128 insertions(+), 127 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c2797f04d931..1c876e27ff93 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -534,7 +534,6 @@ extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; #define INIT_USER (&root_user) -typedef struct prio_array prio_array_t; struct backing_dev_info; struct reclaim_state; @@ -715,6 +714,8 @@ enum sleep_type { SLEEP_INTERRUPTED, }; +struct prio_array; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -732,7 +733,7 @@ struct task_struct { int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; - prio_array_t *array; + struct prio_array *array; unsigned short ioprio; unsigned int btrace_seq; diff --git a/kernel/sched.c b/kernel/sched.c index 021b31219516..4ee400f9d56b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -188,8 +188,6 @@ static inline unsigned int task_timeslice(struct task_struct *p) * These are the runqueue data structures: */ -typedef struct runqueue runqueue_t; - struct prio_array { unsigned int nr_active; DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ @@ -203,7 +201,7 @@ struct prio_array { * (such as the load balancing or the thread migration code), lock * acquire operations must be ordered by ascending &runqueue. */ -struct runqueue { +struct rq { spinlock_t lock; /* @@ -229,7 +227,7 @@ struct runqueue { unsigned long long timestamp_last_tick; struct task_struct *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; + struct prio_array *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; @@ -266,7 +264,7 @@ struct runqueue { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct runqueue, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 
@@ -291,16 +289,16 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline int task_running(runqueue_t *rq, struct task_struct *p) +static inline int task_running(struct rq *rq, struct task_struct *p) { return rq->curr == p; } -static inline void prepare_lock_switch(runqueue_t *rq, struct task_struct *next) +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { } -static inline void finish_lock_switch(runqueue_t *rq, struct task_struct *prev) +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ @@ -317,7 +315,7 @@ static inline void finish_lock_switch(runqueue_t *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(runqueue_t *rq, struct task_struct *p) +static inline int task_running(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->oncpu; @@ -326,7 +324,7 @@ static inline int task_running(runqueue_t *rq, struct task_struct *p) #endif } -static inline void prepare_lock_switch(runqueue_t *rq, struct task_struct *next) +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP /* @@ -343,7 +341,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, struct task_struct *next) #endif } -static inline void finish_lock_switch(runqueue_t *rq, struct task_struct *prev) +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* @@ -364,10 +362,10 @@ static inline void finish_lock_switch(runqueue_t *rq, struct task_struct *prev) * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. */ -static inline runqueue_t *__task_rq_lock(struct task_struct *p) +static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { - struct runqueue *rq; + struct rq *rq; repeat_lock_task: rq = task_rq(p); @@ -384,10 +382,10 @@ repeat_lock_task: * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static runqueue_t *task_rq_lock(struct task_struct *p, unsigned long *flags) +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) __acquires(rq->lock) { - struct runqueue *rq; + struct rq *rq; repeat_lock_task: local_irq_save(*flags); @@ -400,13 +398,13 @@ repeat_lock_task: return rq; } -static inline void __task_rq_unlock(runqueue_t *rq) +static inline void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { spin_unlock(&rq->lock); } -static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) __releases(rq->lock) { spin_unlock_irqrestore(&rq->lock, *flags); @@ -426,7 +424,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); seq_printf(seq, "timestamp %lu\n", jiffies); for_each_online_cpu(cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_SMP struct sched_domain *sd; int dcnt = 0; @@ -513,10 +511,10 @@ struct file_operations proc_schedstat_operations = { /* * rq_lock - lock a given runqueue and disable interrupts. 
*/ -static inline runqueue_t *this_rq_lock(void) +static inline struct rq *this_rq_lock(void) __acquires(rq->lock) { - runqueue_t *rq; + struct rq *rq; local_irq_disable(); rq = this_rq(); @@ -554,7 +552,7 @@ static inline void sched_info_dequeued(struct task_struct *t) static void sched_info_arrive(struct task_struct *t) { unsigned long now = jiffies, diff = 0; - struct runqueue *rq = task_rq(t); + struct rq *rq = task_rq(t); if (t->sched_info.last_queued) diff = now - t->sched_info.last_queued; @@ -597,7 +595,7 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - struct runqueue *rq = task_rq(t); + struct rq *rq = task_rq(t); unsigned long diff = jiffies - t->sched_info.last_arrival; t->sched_info.cpu_time += diff; @@ -614,7 +612,7 @@ static inline void sched_info_depart(struct task_struct *t) static inline void sched_info_switch(struct task_struct *prev, struct task_struct *next) { - struct runqueue *rq = task_rq(prev); + struct rq *rq = task_rq(prev); /* * prev now departs the cpu. It's not interesting to record @@ -635,7 +633,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) /* * Adding/removing a task to/from a priority array: */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, struct prio_array *array) { array->nr_active--; list_del(&p->run_list); @@ -643,7 +641,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) __clear_bit(p->prio, array->bitmap); } -static void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, struct prio_array *array) { sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); @@ -656,12 +654,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. */ -static void requeue_task(struct task_struct *p, prio_array_t *array) +static void requeue_task(struct task_struct *p, struct prio_array *array) { list_move_tail(&p->run_list, array->queue + p->prio); } -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +static inline void +enqueue_task_head(struct task_struct *p, struct prio_array *array) { list_add(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -739,24 +738,24 @@ static void set_load_weight(struct task_struct *p) } static inline void -inc_raw_weighted_load(runqueue_t *rq, const struct task_struct *p) +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) { rq->raw_weighted_load += p->load_weight; } static inline void -dec_raw_weighted_load(runqueue_t *rq, const struct task_struct *p) +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) { rq->raw_weighted_load -= p->load_weight; } -static inline void inc_nr_running(struct task_struct *p, runqueue_t *rq) +static inline void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; inc_raw_weighted_load(rq, p); } -static inline void dec_nr_running(struct task_struct *p, runqueue_t *rq) +static inline void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; dec_raw_weighted_load(rq, p); @@ -803,9 +802,9 @@ static int effective_prio(struct task_struct *p) /* * __activate_task - move a task to the runqueue. 
*/ -static void __activate_task(struct task_struct *p, runqueue_t *rq) +static void __activate_task(struct task_struct *p, struct rq *rq) { - prio_array_t *target = rq->active; + struct prio_array *target = rq->active; if (batch_task(p)) target = rq->expired; @@ -816,7 +815,7 @@ static void __activate_task(struct task_struct *p, runqueue_t *rq) /* * __activate_idle_task - move idle task to the _front_ of runqueue. */ -static inline void __activate_idle_task(struct task_struct *p, runqueue_t *rq) +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) { enqueue_task_head(p, rq->active); inc_nr_running(p, rq); @@ -898,7 +897,7 @@ static int recalc_task_prio(struct task_struct *p, unsigned long long now) * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static void activate_task(struct task_struct *p, runqueue_t *rq, int local) +static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; @@ -906,7 +905,7 @@ static void activate_task(struct task_struct *p, runqueue_t *rq, int local) #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ - runqueue_t *this_rq = this_rq(); + struct rq *this_rq = this_rq(); now = (now - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; } @@ -945,7 +944,7 @@ static void activate_task(struct task_struct *p, runqueue_t *rq, int local) /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); @@ -1009,23 +1008,23 @@ unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_SMP -typedef struct { +struct migration_req { struct list_head list; struct task_struct *task; int dest_cpu; struct completion done; -} migration_req_t; +}; /* * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ static int -migrate_task(struct task_struct *p, int dest_cpu, migration_req_t *req) +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) { - runqueue_t *rq = task_rq(p); + struct rq *rq = task_rq(p); /* * If the task is not on a runqueue (and not running), then @@ -1056,7 +1055,7 @@ migrate_task(struct task_struct *p, int dest_cpu, migration_req_t *req) void wait_task_inactive(struct task_struct *p) { unsigned long flags; - runqueue_t *rq; + struct rq *rq; int preempted; repeat: @@ -1107,7 +1106,7 @@ void kick_process(struct task_struct *p) */ static inline unsigned long source_load(int cpu, int type) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); if (type == 0) return rq->raw_weighted_load; @@ -1121,7 +1120,7 @@ static inline unsigned long source_load(int cpu, int type) */ static inline unsigned long target_load(int cpu, int type) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); if (type == 0) return rq->raw_weighted_load; @@ -1134,7 +1133,7 @@ static inline unsigned long target_load(int cpu, int type) */ static inline unsigned long cpu_avg_load_per_task(int cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); unsigned long n = rq->nr_running; return n ? 
rq->raw_weighted_load / n : SCHED_LOAD_SCALE; @@ -1338,10 +1337,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) int cpu, this_cpu, success = 0; unsigned long flags; long old_state; - runqueue_t *rq; + struct rq *rq; #ifdef CONFIG_SMP - unsigned long load, this_load; struct sched_domain *sd, *this_sd = NULL; + unsigned long load, this_load; int new_cpu; #endif @@ -1577,9 +1576,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { + struct rq *rq, *this_rq; unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); @@ -1662,7 +1661,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) void fastcall sched_exit(struct task_struct *p) { unsigned long flags; - runqueue_t *rq; + struct rq *rq; /* * If the child was a (relative-) CPU hog then decrease @@ -1693,7 +1692,7 @@ void fastcall sched_exit(struct task_struct *p) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(runqueue_t *rq, struct task_struct *next) +static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) { prepare_lock_switch(rq, next); prepare_arch_switch(next); @@ -1714,7 +1713,7 @@ static inline void prepare_task_switch(runqueue_t *rq, struct task_struct *next) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(runqueue_t *rq, struct task_struct *prev) +static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -1755,7 +1754,8 @@ static inline void finish_task_switch(runqueue_t *rq, struct task_struct *prev) asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); + struct rq *rq = this_rq(); + finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ @@ -1770,7 +1770,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * thread's register state. */ static inline struct task_struct * -context_switch(runqueue_t *rq, struct task_struct *prev, +context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct mm_struct *mm = next->mm; @@ -1883,7 +1883,7 @@ task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ -static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { @@ -1907,7 +1907,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ -static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { @@ -1921,7 +1921,7 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
*/ -static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) +static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) @@ -1944,9 +1944,9 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) */ static void sched_migrate_task(struct task_struct *p, int dest_cpu) { - migration_req_t req; - runqueue_t *rq; + struct migration_req req; unsigned long flags; + struct rq *rq; rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) @@ -1987,9 +1987,9 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(runqueue_t *src_rq, prio_array_t *src_array, - struct task_struct *p, runqueue_t *this_rq, - prio_array_t *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct prio_array *src_array, + struct task_struct *p, struct rq *this_rq, + struct prio_array *this_array, int this_cpu) { dequeue_task(p, src_array); dec_nr_running(p, src_rq); @@ -2010,7 +2010,7 @@ static void pull_task(runqueue_t *src_rq, prio_array_t *src_array, * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, runqueue_t *rq, int this_cpu, +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { @@ -2050,14 +2050,14 @@ int can_migrate_task(struct task_struct *p, runqueue_t *rq, int this_cpu, * * Called with both runqueues locked. */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, best_prio_seen, skip_for_load; - prio_array_t *array, *dst_array; + struct prio_array *array, *dst_array; struct list_head *head, *curr; struct task_struct *tmp; long rem_load_move; @@ -2212,7 +2212,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load = sum_nr_running = avg_load = 0; for_each_cpu_mask(i, group->cpumask) { - runqueue_t *rq = cpu_rq(i); + struct rq *rq = cpu_rq(i); if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -2428,11 +2428,11 @@ ret: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static runqueue_t * +static struct rq * find_busiest_queue(struct sched_group *group, enum idle_type idle, unsigned long imbalance) { - runqueue_t *busiest = NULL, *rq; + struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; @@ -2468,13 +2468,13 @@ static inline unsigned long minus_1_or_zero(unsigned long n) * * Called with this_rq unlocked. */ -static int load_balance(int this_cpu, runqueue_t *this_rq, +static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum idle_type idle) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; - runqueue_t *busiest; + struct rq *busiest; if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) @@ -2596,10 +2596,10 @@ out_one_pinned: * this_rq is locked. 
*/ static int -load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; - runqueue_t *busiest = NULL; + struct rq *busiest = NULL; unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; @@ -2657,7 +2657,7 @@ out_balanced: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static void idle_balance(int this_cpu, runqueue_t *this_rq) +static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; @@ -2678,11 +2678,11 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq) * * Called with busiest_rq locked. */ -static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) { - struct sched_domain *sd; - runqueue_t *target_rq; int target_cpu = busiest_rq->push_cpu; + struct sched_domain *sd; + struct rq *target_rq; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) @@ -2736,7 +2736,7 @@ static inline unsigned long cpu_offset(int cpu) } static void -rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) { unsigned long this_load, interval, j = cpu_offset(this_cpu); struct sched_domain *sd; @@ -2790,15 +2790,15 @@ rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) { } -static inline void idle_balance(int cpu, runqueue_t *rq) +static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) +static inline int wake_priority_sleeper(struct rq *rq) { int ret = 0; @@ -2826,7 +2826,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); * Bank in p->sched_time the ns elapsed since the last tick or switch. */ static inline void -update_cpu_clock(struct task_struct *p, runqueue_t *rq, unsigned long long now) +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); } @@ -2858,7 +2858,7 @@ unsigned long long current_sched_time(const struct task_struct *p) * increasing number of running tasks. 
We also ignore the interactivity * if a better static_prio task has expired: */ -static inline int expired_starving(runqueue_t *rq) +static inline int expired_starving(struct rq *rq) { if (rq->curr->static_prio > rq->best_expired_prio) return 1; @@ -2900,7 +2900,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - runqueue_t *rq = this_rq(); + struct rq *rq = this_rq(); cputime64_t tmp; p->stime = cputime_add(p->stime, cputime); @@ -2930,7 +2930,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp = cputime_to_cputime64(steal); - runqueue_t *rq = this_rq(); + struct rq *rq = this_rq(); if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); @@ -2954,7 +2954,7 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); - runqueue_t *rq = this_rq(); + struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); @@ -3043,7 +3043,7 @@ out: } #ifdef CONFIG_SCHED_SMT -static inline void wakeup_busy_runqueue(runqueue_t *rq) +static inline void wakeup_busy_runqueue(struct rq *rq) { /* If an SMT runqueue is sleeping due to priority reasons wake it up */ if (rq->curr == rq->idle && rq->nr_running) @@ -3069,7 +3069,7 @@ static void wake_sleeping_dependent(int this_cpu) return; for_each_cpu_mask(i, sd->span) { - runqueue_t *smt_rq = cpu_rq(i); + struct rq *smt_rq = cpu_rq(i); if (i == this_cpu) continue; @@ -3099,7 +3099,7 @@ smt_slice(struct task_struct *p, struct sched_domain *sd) * need to be obeyed. */ static int -dependent_sleeper(int this_cpu, runqueue_t *this_rq, struct task_struct *p) +dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) { struct sched_domain *tmp, *sd = NULL; int ret = 0, i; @@ -3120,7 +3120,7 @@ dependent_sleeper(int this_cpu, runqueue_t *this_rq, struct task_struct *p) for_each_cpu_mask(i, sd->span) { struct task_struct *smt_curr; - runqueue_t *smt_rq; + struct rq *smt_rq; if (i == this_cpu) continue; @@ -3166,7 +3166,7 @@ static inline void wake_sleeping_dependent(int this_cpu) { } static inline int -dependent_sleeper(int this_cpu, runqueue_t *this_rq, struct task_struct *p) +dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) { return 0; } @@ -3221,13 +3221,13 @@ static inline int interactive_sleep(enum sleep_type sleep_type) asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; + struct prio_array *array; struct list_head *queue; unsigned long long now; unsigned long run_time; int cpu, idx, new_prio; - prio_array_t *array; long *switch_count; - runqueue_t *rq; + struct rq *rq; /* * Test if we are atomic. 
Since do_exit() needs to call into @@ -3787,9 +3787,9 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { + struct prio_array *array; unsigned long flags; - prio_array_t *array; - runqueue_t *rq; + struct rq *rq; int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3828,10 +3828,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { + struct prio_array *array; int old_prio, delta; unsigned long flags; - prio_array_t *array; - runqueue_t *rq; + struct rq *rq; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -4012,9 +4012,9 @@ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { int retval, oldprio, oldpolicy = -1; - prio_array_t *array; + struct prio_array *array; unsigned long flags; - runqueue_t *rq; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4376,9 +4376,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; + struct rq *rq = this_rq_lock(); + struct prio_array *array = current->array, *target = rq->expired; schedstat_inc(rq, yld_cnt); /* @@ -4525,7 +4524,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4535,7 +4534,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); @@ -4743,7 +4742,7 @@ void show_state(void) */ void __devinit init_idle(struct task_struct *idle, int cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); unsigned long flags; idle->timestamp = sched_clock(); @@ -4782,7 +4781,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; /* * This is how migration works: * - * 1) we queue a migration_req_t structure in the source CPU's + * 1) we queue a struct migration_req structure in the source CPU's * runqueue and wake up that CPU's migration thread. * 2) we down() the locked semaphore => thread blocks. 
* 3) migration thread wakes up (implicitly it forces the migrated @@ -4806,9 +4805,9 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; */ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { + struct migration_req req; unsigned long flags; - migration_req_t req; - runqueue_t *rq; + struct rq *rq; int ret = 0; rq = task_rq_lock(p, &flags); @@ -4850,7 +4849,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - runqueue_t *rq_dest, *rq_src; + struct rq *rq_dest, *rq_src; int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) @@ -4896,15 +4895,15 @@ out: static int migration_thread(void *data) { int cpu = (long)data; - runqueue_t *rq; + struct rq *rq; rq = cpu_rq(cpu); BUG_ON(rq->migration_thread != current); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { + struct migration_req *req; struct list_head *head; - migration_req_t *req; try_to_freeze(); @@ -4928,7 +4927,7 @@ static int migration_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); continue; } - req = list_entry(head->next, migration_req_t, list); + req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); spin_unlock(&rq->lock); @@ -4955,10 +4954,10 @@ wait_to_die: /* Figure out where task on dead CPU should go, use force if neccessary. */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - runqueue_t *rq; unsigned long flags; - int dest_cpu; cpumask_t mask; + struct rq *rq; + int dest_cpu; restart: /* On same node? */ @@ -4998,9 +4997,9 @@ restart: * their home CPUs. So we just add the counter to another CPU's counter, * to keep the global sum constant after CPU-down: */ -static void migrate_nr_uninterruptible(runqueue_t *rq_src) +static void migrate_nr_uninterruptible(struct rq *rq_src) { - runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); unsigned long flags; local_irq_save(flags); @@ -5036,7 +5035,7 @@ static void migrate_live_tasks(int src_cpu) void sched_idle_next(void) { int this_cpu = smp_processor_id(); - runqueue_t *rq = cpu_rq(this_cpu); + struct rq *rq = cpu_rq(this_cpu); struct task_struct *p = rq->idle; unsigned long flags; @@ -5074,7 +5073,7 @@ void idle_task_exit(void) static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { - struct runqueue *rq = cpu_rq(dead_cpu); + struct rq *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); @@ -5099,7 +5098,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) /* release_task() removes task from tasklist, so we won't find dead tasks. */ static void migrate_dead_tasks(unsigned int dead_cpu) { - struct runqueue *rq = cpu_rq(dead_cpu); + struct rq *rq = cpu_rq(dead_cpu); unsigned int arr, i; for (arr = 0; arr < 2; arr++) { @@ -5123,8 +5122,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { struct task_struct *p; int cpu = (long)hcpu; - struct runqueue *rq; unsigned long flags; + struct rq *rq; switch (action) { case CPU_UP_PREPARE: @@ -5176,9 +5175,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * the requestors. 
*/ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { - migration_req_t *req; + struct migration_req *req; + req = list_entry(rq->migration_queue.next, - migration_req_t, list); + struct migration_req, list); list_del_init(&req->list); complete(&req->done); } @@ -5361,7 +5361,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) */ static void cpu_attach_domain(struct sched_domain *sd, int cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. */ @@ -6690,8 +6690,8 @@ void __init sched_init(void) int i, j, k; for_each_possible_cpu(i) { - prio_array_t *array; - runqueue_t *rq; + struct prio_array *array; + struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); @@ -6764,10 +6764,10 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { + struct prio_array *array; struct task_struct *p; - prio_array_t *array; unsigned long flags; - runqueue_t *rq; + struct rq *rq; read_lock_irq(&tasklist_lock); for_each_process(p) { -- cgit v1.2.3 From 006f68b84fe19fc5015a8cf838a10d75f91f0218 Mon Sep 17 00:00:00 2001 From: Ralf Baechle DL5RB Date: Mon, 3 Jul 2006 19:30:18 -0700 Subject: [AX.25]: Reference counting for AX.25 routes. In the past routes could be freed even though they were possibly in use ... Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller --- include/net/ax25.h | 24 +++++++++++++++--------- net/ax25/ax25_ip.c | 23 +++++++++++++++-------- net/ax25/ax25_route.c | 49 ++++++++----------------------------------------- 3 files changed, 38 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index 7cd528e9d668..69374cd1a857 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -182,14 +182,26 @@ typedef struct { typedef struct ax25_route { struct ax25_route *next; - atomic_t ref; + atomic_t refcount; ax25_address callsign; struct net_device *dev; ax25_digi *digipeat; char ip_mode; - struct timer_list timer; } ax25_route; +static inline void ax25_hold_route(ax25_route *ax25_rt) +{ + atomic_inc(&ax25_rt->refcount); +} + +extern void __ax25_put_route(ax25_route *ax25_rt); + +static inline void ax25_put_route(ax25_route *ax25_rt) +{ + if (atomic_dec_and_test(&ax25_rt->refcount)) + __ax25_put_route(ax25_rt); +} + typedef struct { char slave; /* slave_mode? 
*/ struct timer_list slave_timer; /* timeout timer */ @@ -348,17 +360,11 @@ extern int ax25_check_iframes_acked(ax25_cb *, unsigned short); extern void ax25_rt_device_down(struct net_device *); extern int ax25_rt_ioctl(unsigned int, void __user *); extern struct file_operations ax25_route_fops; +extern ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); extern int ax25_rt_autobind(ax25_cb *, ax25_address *); -extern ax25_route *ax25_rt_find_route(ax25_route *, ax25_address *, - struct net_device *); extern struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); extern void ax25_rt_free(void); -static inline void ax25_put_route(ax25_route *ax25_rt) -{ - atomic_dec(&ax25_rt->ref); -} - /* ax25_std_in.c */ extern int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 9be5c15e63d3..136c3aefa9de 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -103,11 +103,13 @@ int ax25_rebuild_header(struct sk_buff *skb) { struct sk_buff *ourskb; unsigned char *bp = skb->data; - struct net_device *dev; + ax25_route *route; + struct net_device *dev = NULL; ax25_address *src, *dst; + ax25_digi *digipeat = NULL; ax25_dev *ax25_dev; - ax25_route _route, *route = &_route; ax25_cb *ax25; + char ip_mode = ' '; dst = (ax25_address *)(bp + 1); src = (ax25_address *)(bp + 8); @@ -115,8 +117,12 @@ int ax25_rebuild_header(struct sk_buff *skb) if (arp_find(bp + 1, skb)) return 1; - route = ax25_rt_find_route(route, dst, NULL); - dev = route->dev; + route = ax25_get_route(dst, NULL); + if (route) { + digipeat = route->digipeat; + dev = route->dev; + ip_mode = route->ip_mode; + }; if (dev == NULL) dev = skb->dev; @@ -126,7 +132,7 @@ int ax25_rebuild_header(struct sk_buff *skb) } if (bp[16] == AX25_P_IP) { - if (route->ip_mode == 'V' || (route->ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) { + if (ip_mode == 'V' || (ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) { /* * We copy the buffer and release the original thereby * keeping it straight @@ -172,7 +178,7 @@ int ax25_rebuild_header(struct sk_buff *skb) ourskb, ax25_dev->values[AX25_VALUES_PACLEN], &src_c, - &dst_c, route->digipeat, dev); + &dst_c, digipeat, dev); if (ax25) { ax25_cb_put(ax25); } @@ -190,7 +196,7 @@ int ax25_rebuild_header(struct sk_buff *skb) skb_pull(skb, AX25_KISS_HEADER_LEN); - if (route->digipeat != NULL) { + if (digipeat != NULL) { if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) { kfree_skb(skb); goto put; @@ -202,7 +208,8 @@ int ax25_rebuild_header(struct sk_buff *skb) ax25_queue_xmit(skb, dev); put: - ax25_put_route(route); + if (route) + ax25_put_route(route); return 1; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 5ac98250797b..51b7bdaf27eb 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -41,8 +41,6 @@ static ax25_route *ax25_route_list; static DEFINE_RWLOCK(ax25_route_lock); -static ax25_route *ax25_get_route(ax25_address *, struct net_device *); - void ax25_rt_device_down(struct net_device *dev) { ax25_route *s, *t, *ax25_rt; @@ -115,7 +113,7 @@ static int ax25_rt_add(struct ax25_routes_struct *route) return -ENOMEM; } - atomic_set(&ax25_rt->ref, 0); + atomic_set(&ax25_rt->refcount, 1); ax25_rt->callsign = route->dest_addr; ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; @@ -140,23 +138,10 @@ static int ax25_rt_add(struct ax25_routes_struct *route) return 0; } -static void ax25_rt_destroy(ax25_route *ax25_rt) 
+void __ax25_put_route(ax25_route *ax25_rt) { - if (atomic_read(&ax25_rt->ref) == 0) { - kfree(ax25_rt->digipeat); - kfree(ax25_rt); - return; - } - - /* - * Uh... Route is still in use; we can't yet destroy it. Retry later. - */ - init_timer(&ax25_rt->timer); - ax25_rt->timer.data = (unsigned long) ax25_rt; - ax25_rt->timer.function = (void *) ax25_rt_destroy; - ax25_rt->timer.expires = jiffies + 5 * HZ; - - add_timer(&ax25_rt->timer); + kfree(ax25_rt->digipeat); + kfree(ax25_rt); } static int ax25_rt_del(struct ax25_routes_struct *route) @@ -177,12 +162,12 @@ static int ax25_rt_del(struct ax25_routes_struct *route) ax25cmp(&route->dest_addr, &s->callsign) == 0) { if (ax25_route_list == s) { ax25_route_list = s->next; - ax25_rt_destroy(s); + ax25_put_route(s); } else { for (t = ax25_route_list; t != NULL; t = t->next) { if (t->next == s) { t->next = s->next; - ax25_rt_destroy(s); + ax25_put_route(s); break; } } @@ -362,7 +347,7 @@ struct file_operations ax25_route_fops = { * * Only routes with a reference count of zero can be destroyed. */ -static ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) +ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) { ax25_route *ax25_spe_rt = NULL; ax25_route *ax25_def_rt = NULL; @@ -392,7 +377,7 @@ static ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) ax25_rt = ax25_spe_rt; if (ax25_rt != NULL) - atomic_inc(&ax25_rt->ref); + ax25_hold_route(ax25_rt); read_unlock(&ax25_route_lock); @@ -467,24 +452,6 @@ put: return 0; } -ax25_route *ax25_rt_find_route(ax25_route * route, ax25_address *addr, - struct net_device *dev) -{ - ax25_route *ax25_rt; - - if ((ax25_rt = ax25_get_route(addr, dev))) - return ax25_rt; - - route->next = NULL; - atomic_set(&route->ref, 1); - route->callsign = *addr; - route->dev = dev; - route->digipeat = NULL; - route->ip_mode = ' '; - - return route; -} - struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_address *dest, ax25_digi *digi) { -- cgit v1.2.3 From fe4ada2d6f0b746246e9b5bf0f4f2e4d3a07d26e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Jul 2006 19:44:51 -0700 Subject: [IOAT]: fix header file kernel-doc Fix kernel-doc problems in include/linux/dmaengine.h: - add some fields/parameters - expand some descriptions - fix typos Signed-off-by: Randy Dunlap Signed-off-by: David S. 
Miller --- include/linux/dmaengine.h | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 272010a6078a..c94d8f1d62e5 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -44,7 +44,7 @@ enum dma_event { }; /** - * typedef dma_cookie_t + * typedef dma_cookie_t - an opaque DMA cookie * * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code */ @@ -80,14 +80,14 @@ struct dma_chan_percpu { /** * struct dma_chan - devices supply DMA channels, clients use them - * @client: ptr to the client user of this chan, will be NULL when unused - * @device: ptr to the dma device who supplies this channel, always !NULL + * @client: ptr to the client user of this chan, will be %NULL when unused + * @device: ptr to the dma device who supplies this channel, always !%NULL * @cookie: last cookie value returned to client - * @chan_id: - * @class_dev: + * @chan_id: channel ID for sysfs + * @class_dev: class device for sysfs * @refcount: kref, used in "bigref" slow-mode - * @slow_ref: - * @rcu: + * @slow_ref: indicates that the DMA channel is free + * @rcu: the DMA channel's RCU head * @client_node: used to add this to the client chan list * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu @@ -162,10 +162,17 @@ struct dma_client { * @chancnt: how many DMA channels are supported * @channels: the list of struct dma_chan * @global_node: list_head for global dma_device_list - * @refcount: - * @done: - * @dev_id: - * Other func ptrs: used to make use of this device's capabilities + * @refcount: reference count + * @done: IO completion struct + * @dev_id: unique device ID + * @device_alloc_chan_resources: allocate resources and return the + * number of allocated descriptors + * @device_free_chan_resources: release DMA channel's resources + * @device_memcpy_buf_to_buf: memcpy buf pointer to buf pointer + * @device_memcpy_buf_to_pg: memcpy buf pointer to struct page + * @device_memcpy_pg_to_pg: memcpy struct page/offset to struct page/offset + * @device_memcpy_complete: poll the status of an IOAT DMA transaction + * @device_memcpy_issue_pending: push appended descriptors to hardware */ struct dma_device { @@ -211,7 +218,7 @@ void dma_async_client_chan_request(struct dma_client *client, * Both @dest and @src must be mappable to a bus address according to the * DMA mapping API rules for streaming mappings. * Both @dest and @src must stay memory resident (kernel memory or locked - * user space pages) + * user space pages). 
*/ static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len) @@ -225,7 +232,7 @@ static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, } /** - * dma_async_memcpy_buf_to_pg - offloaded copy + * dma_async_memcpy_buf_to_pg - offloaded copy from address to page * @chan: DMA channel to offload copy to * @page: destination page * @offset: offset in page to copy to @@ -250,18 +257,18 @@ static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, } /** - * dma_async_memcpy_buf_to_pg - offloaded copy + * dma_async_memcpy_pg_to_pg - offloaded copy from page to page * @chan: DMA channel to offload copy to - * @dest_page: destination page + * @dest_pg: destination page * @dest_off: offset in page to copy to - * @src_page: source page + * @src_pg: source page * @src_off: offset in page to copy from * @len: length * * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus * address according to the DMA mapping API rules for streaming mappings. * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident - * (kernel memory or locked user space pages) + * (kernel memory or locked user space pages). */ static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, unsigned int dest_off, struct page *src_pg, @@ -278,7 +285,7 @@ static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, /** * dma_async_memcpy_issue_pending - flush pending copies to HW - * @chan: + * @chan: target DMA channel * * This allows drivers to push copies to HW in batches, * reducing MMIO writes where possible. -- cgit v1.2.3 From 4bdbf6c033ba05bff65f69989baccd7103c5a530 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Jul 2006 19:47:27 -0700 Subject: [NET]: add+use poison defines Add and use poison defines in net/. Signed-off-by: Randy Dunlap Signed-off-by: David S. 
Miller --- include/linux/poison.h | 4 ++++ net/atm/clip.c | 3 ++- net/ipv6/netfilter/ip6_tables.c | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/poison.h b/include/linux/poison.h index a5347c02432e..05d7d0fd6bdb 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -45,6 +45,10 @@ /********** drivers/atm/ **********/ #define ATM_POISON_FREE 0x12 +/********** net/ **********/ +#define NEIGHBOR_DEAD 0xdeadbeef +#define NETFILTER_LINK_POISON 0xdead57ac + /********** kernel/mutexes **********/ #define MUTEX_DEBUG_INIT 0x11 #define MUTEX_DEBUG_FREE 0x22 diff --git a/net/atm/clip.c b/net/atm/clip.c index 87a454f5c89c..121bf6f49148 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -23,6 +23,7 @@ #include /* for IFF_UP */ #include #include +#include #include #include #include @@ -266,7 +267,7 @@ static void clip_neigh_destroy(struct neighbour *neigh) DPRINTK("clip_neigh_destroy (neigh %p)\n", neigh); if (NEIGH2ENTRY(neigh)->vccs) printk(KERN_CRIT "clip_neigh_destroy: vccs != NULL !!!\n"); - NEIGH2ENTRY(neigh)->vccs = (void *) 0xdeadbeef; + NEIGH2ENTRY(neigh)->vccs = (void *) NEIGHBOR_DEAD; } static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7ef143c0ebf6..f26898b00347 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -376,7 +377,7 @@ ip6t_do_table(struct sk_buff **pskb, } while (!hotdrop); #ifdef CONFIG_NETFILTER_DEBUG - ((struct ip6t_entry *)table_base)->comefrom = 0xdead57ac; + ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif read_unlock_bh(&table->lock); -- cgit v1.2.3 From 3c6b377321678c649f9b3c66da0149975c614102 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Jul 2006 19:48:25 -0700 Subject: [ATM]: add+use poison defines ATM: add and use POISON define values. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- drivers/atm/ambassador.c | 3 ++- drivers/atm/idt77252.c | 3 ++- include/linux/poison.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index d3b426313a41..4521a249dd56 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1995,7 +1996,7 @@ static int __devinit ucode_init (loader_block * lb, amb_dev * dev) { } i += 1; } - if (*pointer == 0xdeadbeef) { + if (*pointer == ATM_POISON) { return loader_start (lb, dev, ucode_start); } else { // cast needed as there is no %? 
for pointer differnces diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 5d1c6c95262c..b0369bb20f08 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -35,6 +35,7 @@ static char const rcsid[] = #include #include +#include #include #include #include @@ -3657,7 +3658,7 @@ probe_sram(struct idt77252_dev *card) writel(SAR_CMD_WRITE_SRAM | (0 << 2), SAR_REG_CMD); for (addr = 0x4000; addr < 0x80000; addr += 0x4000) { - writel(0xdeadbeef, SAR_REG_DR0); + writel(ATM_POISON, SAR_REG_DR0); writel(SAR_CMD_WRITE_SRAM | (addr << 2), SAR_REG_CMD); writel(SAR_CMD_READ_SRAM | (0 << 2), SAR_REG_CMD); diff --git a/include/linux/poison.h b/include/linux/poison.h index 05d7d0fd6bdb..3e628f990fdf 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -44,6 +44,7 @@ /********** drivers/atm/ **********/ #define ATM_POISON_FREE 0x12 +#define ATM_POISON 0xdeadbeef /********** net/ **********/ #define NEIGHBOR_DEAD 0xdeadbeef -- cgit v1.2.3 From da1f519851d1c66331363253f364bdb5d924ea96 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 3 Jul 2006 10:02:29 +0200 Subject: [Bluetooth] Correct SCO buffer size on request This patch introduces a quirk that allows the drivers to tell the host to correct the SCO buffer size values. Signed-off-by: Olivier Galibert Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_usb.c | 6 ++++++ drivers/bluetooth/hci_usb.h | 1 + include/net/bluetooth/hci.h | 3 ++- net/bluetooth/hci_event.c | 14 +++++++++++--- 4 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 987a3dff8af9..6a0c2230f82f 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -122,6 +122,9 @@ static struct usb_device_id blacklist_ids[] = { /* RTX Telecom based adapter with buggy SCO support */ { USB_DEVICE(0x0400, 0x0807), .driver_info = HCI_BROKEN_ISOC }, + /* Belkin F8T012 */ + { USB_DEVICE(0x050d, 0x0012), .driver_info = HCI_WRONG_SCO_MTU }, + /* Digianswer devices */ { USB_DEVICE(0x08fd, 0x0001), .driver_info = HCI_DIGIANSWER }, { USB_DEVICE(0x08fd, 0x0002), .driver_info = HCI_IGNORE }, @@ -987,6 +990,9 @@ static int hci_usb_probe(struct usb_interface *intf, const struct usb_device_id if (reset || id->driver_info & HCI_RESET) set_bit(HCI_QUIRK_RESET_ON_INIT, &hdev->quirks); + if (id->driver_info & HCI_WRONG_SCO_MTU) + set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks); + if (id->driver_info & HCI_SNIFFER) { if (le16_to_cpu(udev->descriptor.bcdDevice) > 0x997) set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); diff --git a/drivers/bluetooth/hci_usb.h b/drivers/bluetooth/hci_usb.h index 37100a6ea1a8..963fc55cdc85 100644 --- a/drivers/bluetooth/hci_usb.h +++ b/drivers/bluetooth/hci_usb.h @@ -35,6 +35,7 @@ #define HCI_SNIFFER 0x10 #define HCI_BCM92035 0x20 #define HCI_BROKEN_ISOC 0x40 +#define HCI_WRONG_SCO_MTU 0x80 #define HCI_MAX_IFACE_NUM 3 diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index b06a2d2f63d2..99c53f6b8252 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -54,7 +54,8 @@ /* HCI device quirks */ enum { HCI_QUIRK_RESET_ON_INIT, - HCI_QUIRK_RAW_DEVICE + HCI_QUIRK_RAW_DEVICE, + HCI_QUIRK_FIXUP_BUFFER_SIZE }; /* HCI device flags */ diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 618bacee1b1c..f41cf1a6c11a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -319,9 +319,17 @@ static void hci_cc_info_param(struct hci_dev *hdev, __u16 ocf, struct 
sk_buff *s } hdev->acl_mtu = __le16_to_cpu(bs->acl_mtu); - hdev->sco_mtu = bs->sco_mtu ? bs->sco_mtu : 64; - hdev->acl_pkts = hdev->acl_cnt = __le16_to_cpu(bs->acl_max_pkt); - hdev->sco_pkts = hdev->sco_cnt = __le16_to_cpu(bs->sco_max_pkt); + hdev->sco_mtu = bs->sco_mtu; + hdev->acl_pkts = __le16_to_cpu(bs->acl_max_pkt); + hdev->sco_pkts = __le16_to_cpu(bs->sco_max_pkt); + + if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) { + hdev->sco_mtu = 64; + hdev->sco_pkts = 8; + } + + hdev->acl_cnt = hdev->acl_pkts; + hdev->sco_cnt = hdev->sco_pkts; BT_DBG("%s mtu: acl %d, sco %d max_pkt: acl %d, sco %d", hdev->name, hdev->acl_mtu, hdev->sco_mtu, hdev->acl_pkts, hdev->sco_pkts); -- cgit v1.2.3 From 04837f6447c7f3ef114cda1ad761822dedbff8cf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 3 Jul 2006 10:02:33 +0200 Subject: [Bluetooth] Add automatic sniff mode support This patch introduces the automatic sniff mode feature. This allows the host to switch idle connections into sniff mode to save power. Signed-off-by: Ulisses Furquim Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 70 +++++++++++--- include/net/bluetooth/hci_core.h | 54 ++++++----- net/bluetooth/af_bluetooth.c | 2 +- net/bluetooth/hci_conn.c | 100 +++++++++++++++++--- net/bluetooth/hci_core.c | 34 +++++-- net/bluetooth/hci_event.c | 192 +++++++++++++++++++++++++++++++++------ net/bluetooth/hci_sysfs.c | 88 ++++++++++++++++++ 7 files changed, 454 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 99c53f6b8252..b2bdb1aa0429 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -101,9 +101,10 @@ enum { #define HCIINQUIRY _IOR('H', 240, int) /* HCI timeouts */ -#define HCI_CONN_TIMEOUT (HZ * 40) -#define HCI_DISCONN_TIMEOUT (HZ * 2) -#define HCI_CONN_IDLE_TIMEOUT (HZ * 60) +#define HCI_CONNECT_TIMEOUT (40000) /* 40 seconds */ +#define HCI_DISCONN_TIMEOUT (2000) /* 2 seconds */ +#define HCI_IDLE_TIMEOUT (6000) /* 6 seconds */ +#define HCI_INIT_TIMEOUT (10000) /* 10 seconds */ /* HCI Packet types */ #define HCI_COMMAND_PKT 0x01 @@ -145,7 +146,7 @@ enum { #define LMP_TACCURACY 0x10 #define LMP_RSWITCH 0x20 #define LMP_HOLD 0x40 -#define LMP_SNIF 0x80 +#define LMP_SNIFF 0x80 #define LMP_PARK 0x01 #define LMP_RSSI 0x02 @@ -160,13 +161,21 @@ enum { #define LMP_PSCHEME 0x02 #define LMP_PCONTROL 0x04 +#define LMP_SNIFF_SUBR 0x02 + +/* Connection modes */ +#define HCI_CM_ACTIVE 0x0000 +#define HCI_CM_HOLD 0x0001 +#define HCI_CM_SNIFF 0x0002 +#define HCI_CM_PARK 0x0003 + /* Link policies */ #define HCI_LP_RSWITCH 0x0001 #define HCI_LP_HOLD 0x0002 #define HCI_LP_SNIFF 0x0004 #define HCI_LP_PARK 0x0008 -/* Link mode */ +/* Link modes */ #define HCI_LM_ACCEPT 0x8000 #define HCI_LM_MASTER 0x0001 #define HCI_LM_AUTH 0x0002 @@ -192,7 +201,7 @@ struct hci_rp_read_loc_version { } __attribute__ ((packed)); #define OCF_READ_LOCAL_FEATURES 0x0003 -struct hci_rp_read_loc_features { +struct hci_rp_read_local_features { __u8 status; __u8 features[8]; } __attribute__ ((packed)); @@ -376,17 +385,32 @@ struct hci_cp_change_conn_link_key { } __attribute__ ((packed)); #define OCF_READ_REMOTE_FEATURES 0x001B -struct hci_cp_read_rmt_features { +struct hci_cp_read_remote_features { __le16 handle; } __attribute__ ((packed)); #define OCF_READ_REMOTE_VERSION 0x001D -struct hci_cp_read_rmt_version { +struct hci_cp_read_remote_version { __le16 handle; } __attribute__ ((packed)); /* Link Policy */ -#define OGF_LINK_POLICY 0x02 +#define
OGF_LINK_POLICY 0x02 + +#define OCF_SNIFF_MODE 0x0003 +struct hci_cp_sniff_mode { + __le16 handle; + __le16 max_interval; + __le16 min_interval; + __le16 attempt; + __le16 timeout; +} __attribute__ ((packed)); + +#define OCF_EXIT_SNIFF_MODE 0x0004 +struct hci_cp_exit_sniff_mode { + __le16 handle; +} __attribute__ ((packed)); + #define OCF_ROLE_DISCOVERY 0x0009 struct hci_cp_role_discovery { __le16 handle; @@ -407,7 +431,7 @@ struct hci_rp_read_link_policy { __le16 policy; } __attribute__ ((packed)); -#define OCF_SWITCH_ROLE 0x000B +#define OCF_SWITCH_ROLE 0x000B struct hci_cp_switch_role { bdaddr_t bdaddr; __u8 role; @@ -423,6 +447,14 @@ struct hci_rp_write_link_policy { __le16 handle; } __attribute__ ((packed)); +#define OCF_SNIFF_SUBRATE 0x0011 +struct hci_cp_sniff_subrate { + __le16 handle; + __le16 max_latency; + __le16 min_remote_timeout; + __le16 min_local_timeout; +} __attribute__ ((packed)); + /* Status params */ #define OGF_STATUS_PARAM 0x05 @@ -582,15 +614,15 @@ struct hci_ev_link_key_notify { __u8 key_type; } __attribute__ ((packed)); -#define HCI_EV_RMT_FEATURES 0x0B -struct hci_ev_rmt_features { +#define HCI_EV_REMOTE_FEATURES 0x0B +struct hci_ev_remote_features { __u8 status; __le16 handle; __u8 features[8]; } __attribute__ ((packed)); -#define HCI_EV_RMT_VERSION 0x0C -struct hci_ev_rmt_version { +#define HCI_EV_REMOTE_VERSION 0x0C +struct hci_ev_remote_version { __u8 status; __le16 handle; __u8 lmp_ver; @@ -611,6 +643,16 @@ struct hci_ev_pscan_rep_mode { __u8 pscan_rep_mode; } __attribute__ ((packed)); +#define HCI_EV_SNIFF_SUBRATE 0x2E +struct hci_ev_sniff_subrate { + __u8 status; + __le16 handle; + __le16 max_tx_latency; + __le16 max_rx_latency; + __le16 max_remote_timeout; + __le16 max_local_timeout; +} __attribute__ ((packed)); + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xFD struct hci_ev_stack_internal { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bb9f81dc8723..f6852707bd64 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -31,10 +31,7 @@ #define HCI_PROTO_L2CAP 0 #define HCI_PROTO_SCO 1 -#define HCI_INIT_TIMEOUT (HZ * 10) - /* HCI Core structures */ - struct inquiry_data { bdaddr_t bdaddr; __u8 pscan_rep_mode; @@ -81,6 +78,10 @@ struct hci_dev { __u16 link_policy; __u16 link_mode; + __u32 idle_timeout; + __u16 sniff_min_interval; + __u16 sniff_max_interval; + unsigned long quirks; atomic_t cmd_cnt; @@ -145,18 +146,24 @@ struct hci_conn { bdaddr_t dst; __u16 handle; __u16 state; + __u8 mode; __u8 type; __u8 out; __u8 dev_class[3]; + __u8 features[8]; + __u16 interval; + __u16 link_policy; __u32 link_mode; + __u8 power_save; unsigned long pend; - + unsigned int sent; - + struct sk_buff_head data_q; - struct timer_list timer; - + struct timer_list disc_timer; + struct timer_list idle_timer; + struct hci_dev *hdev; void *l2cap_data; void *sco_data; @@ -211,7 +218,8 @@ void hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data); enum { HCI_CONN_AUTH_PEND, HCI_CONN_ENCRYPT_PEND, - HCI_CONN_RSWITCH_PEND + HCI_CONN_RSWITCH_PEND, + HCI_CONN_MODE_CHANGE_PEND, }; static inline void hci_conn_hash_init(struct hci_dev *hdev) @@ -286,31 +294,27 @@ int hci_conn_encrypt(struct hci_conn *conn); int hci_conn_change_link_key(struct hci_conn *conn); int hci_conn_switch_role(struct hci_conn *conn, uint8_t role); -static inline void hci_conn_set_timer(struct hci_conn *conn, unsigned long timeout) -{ - mod_timer(&conn->timer, jiffies + timeout); -} - 
-static inline void hci_conn_del_timer(struct hci_conn *conn) -{ - del_timer(&conn->timer); -} +void hci_conn_enter_active_mode(struct hci_conn *conn); +void hci_conn_enter_sniff_mode(struct hci_conn *conn); static inline void hci_conn_hold(struct hci_conn *conn) { atomic_inc(&conn->refcnt); - hci_conn_del_timer(conn); + del_timer(&conn->disc_timer); } static inline void hci_conn_put(struct hci_conn *conn) { if (atomic_dec_and_test(&conn->refcnt)) { + unsigned long timeo; if (conn->type == ACL_LINK) { - unsigned long timeo = (conn->out) ? - HCI_DISCONN_TIMEOUT : HCI_DISCONN_TIMEOUT * 2; - hci_conn_set_timer(conn, timeo); + timeo = msecs_to_jiffies(HCI_DISCONN_TIMEOUT); + if (!conn->out) + timeo *= 2; + del_timer(&conn->idle_timer); } else - hci_conn_set_timer(conn, HZ / 100); + timeo = msecs_to_jiffies(10); + mod_timer(&conn->disc_timer, jiffies + timeo); } } @@ -411,8 +415,10 @@ void hci_unregister_sysfs(struct hci_dev *hdev); #define SET_HCIDEV_DEV(hdev, pdev) ((hdev)->class_dev.dev = (pdev)) /* ----- LMP capabilities ----- */ -#define lmp_rswitch_capable(dev) (dev->features[0] & LMP_RSWITCH) -#define lmp_encrypt_capable(dev) (dev->features[0] & LMP_ENCRYPT) +#define lmp_rswitch_capable(dev) ((dev)->features[0] & LMP_RSWITCH) +#define lmp_encrypt_capable(dev) ((dev)->features[0] & LMP_ENCRYPT) +#define lmp_sniff_capable(dev) ((dev)->features[0] & LMP_SNIFF) +#define lmp_sniffsubr_capable(dev) ((dev)->features[5] & LMP_SNIFF_SUBR) /* ----- HCI protocols ----- */ struct hci_proto { diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 51f867062e1d..729461fcfe99 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -48,7 +48,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.8" +#define VERSION "2.9" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5c0c2b1ef34a..420ed4d7e57e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -115,8 +115,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle) static void hci_conn_timeout(unsigned long arg) { - struct hci_conn *conn = (void *)arg; - struct hci_dev *hdev = conn->hdev; + struct hci_conn *conn = (void *) arg; + struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p state %d", conn, conn->state); @@ -132,11 +132,13 @@ static void hci_conn_timeout(unsigned long arg) return; } -static void hci_conn_init_timer(struct hci_conn *conn) +static void hci_conn_idle(unsigned long arg) { - init_timer(&conn->timer); - conn->timer.function = hci_conn_timeout; - conn->timer.data = (unsigned long)conn; + struct hci_conn *conn = (void *) arg; + + BT_DBG("conn %p mode %d", conn, conn->mode); + + hci_conn_enter_sniff_mode(conn); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) @@ -145,17 +147,27 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) BT_DBG("%s dst %s", hdev->name, batostr(dst)); - if (!(conn = kmalloc(sizeof(struct hci_conn), GFP_ATOMIC))) + conn = kzalloc(sizeof(struct hci_conn), GFP_ATOMIC); + if (!conn) return NULL; - memset(conn, 0, sizeof(struct hci_conn)); bacpy(&conn->dst, dst); - conn->type = type; conn->hdev = hdev; + conn->type = type; + conn->mode = HCI_CM_ACTIVE; conn->state = BT_OPEN; + conn->power_save = 1; + skb_queue_head_init(&conn->data_q); - hci_conn_init_timer(conn); + + init_timer(&conn->disc_timer); + conn->disc_timer.function = hci_conn_timeout; + conn->disc_timer.data = (unsigned long) conn; + + init_timer(&conn->idle_timer); + 
conn->idle_timer.function = hci_conn_idle; + conn->idle_timer.data = (unsigned long) conn; atomic_set(&conn->refcnt, 0); @@ -178,7 +190,9 @@ int hci_conn_del(struct hci_conn *conn) BT_DBG("%s conn %p handle %d", hdev->name, conn, conn->handle); - hci_conn_del_timer(conn); + del_timer(&conn->idle_timer); + + del_timer(&conn->disc_timer); if (conn->type == SCO_LINK) { struct hci_conn *acl = conn->link; @@ -364,6 +378,70 @@ int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) } EXPORT_SYMBOL(hci_conn_switch_role); +/* Enter active mode */ +void hci_conn_enter_active_mode(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p mode %d", conn, conn->mode); + + if (test_bit(HCI_RAW, &hdev->flags)) + return; + + if (conn->mode != HCI_CM_SNIFF || !conn->power_save) + goto timer; + + if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { + struct hci_cp_exit_sniff_mode cp; + cp.handle = __cpu_to_le16(conn->handle); + hci_send_cmd(hdev, OGF_LINK_POLICY, + OCF_EXIT_SNIFF_MODE, sizeof(cp), &cp); + } + +timer: + if (hdev->idle_timeout > 0) + mod_timer(&conn->idle_timer, + jiffies + msecs_to_jiffies(hdev->idle_timeout)); +} + +/* Enter sniff mode */ +void hci_conn_enter_sniff_mode(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p mode %d", conn, conn->mode); + + if (test_bit(HCI_RAW, &hdev->flags)) + return; + + if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn)) + return; + + if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF)) + return; + + if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { + struct hci_cp_sniff_subrate cp; + cp.handle = __cpu_to_le16(conn->handle); + cp.max_latency = __constant_cpu_to_le16(0); + cp.min_remote_timeout = __constant_cpu_to_le16(0); + cp.min_local_timeout = __constant_cpu_to_le16(0); + hci_send_cmd(hdev, OGF_LINK_POLICY, + OCF_SNIFF_SUBRATE, sizeof(cp), &cp); + } + + if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { + struct hci_cp_sniff_mode cp; + cp.handle = __cpu_to_le16(conn->handle); + cp.max_interval = __cpu_to_le16(hdev->sniff_max_interval); + cp.min_interval = __cpu_to_le16(hdev->sniff_min_interval); + cp.attempt = __constant_cpu_to_le16(4); + cp.timeout = __constant_cpu_to_le16(1); + hci_send_cmd(hdev, OGF_LINK_POLICY, + OCF_SNIFF_MODE, sizeof(cp), &cp); + } +} + /* Drop all connection on the device */ void hci_conn_hash_flush(struct hci_dev *hdev) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f67240beb0dd..3f9f1565bf06 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -411,7 +411,7 @@ int hci_inquiry(void __user *arg) } hci_dev_unlock_bh(hdev); - timeo = ir.length * 2 * HZ; + timeo = ir.length * msecs_to_jiffies(2000); if (do_inquiry && (err = hci_request(hdev, hci_inq_req, (unsigned long)&ir, timeo)) < 0) goto done; @@ -479,7 +479,8 @@ int hci_dev_open(__u16 dev) set_bit(HCI_INIT, &hdev->flags); //__hci_request(hdev, hci_reset_req, 0, HZ); - ret = __hci_request(hdev, hci_init_req, 0, HCI_INIT_TIMEOUT); + ret = __hci_request(hdev, hci_init_req, 0, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); clear_bit(HCI_INIT, &hdev->flags); } @@ -546,7 +547,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) atomic_set(&hdev->cmd_cnt, 1); if (!test_bit(HCI_RAW, &hdev->flags)) { set_bit(HCI_INIT, &hdev->flags); - __hci_request(hdev, hci_reset_req, 0, HZ/4); + __hci_request(hdev, hci_reset_req, 0, + msecs_to_jiffies(250)); clear_bit(HCI_INIT, &hdev->flags); } @@ -619,7 +621,8 @@ int 
hci_dev_reset(__u16 dev) hdev->acl_cnt = 0; hdev->sco_cnt = 0; if (!test_bit(HCI_RAW, &hdev->flags)) - ret = __hci_request(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); + ret = __hci_request(hdev, hci_reset_req, 0, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); done: tasklet_enable(&hdev->tx_task); @@ -657,7 +660,8 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) switch (cmd) { case HCISETAUTH: - err = hci_request(hdev, hci_auth_req, dr.dev_opt, HCI_INIT_TIMEOUT); + err = hci_request(hdev, hci_auth_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); break; case HCISETENCRYPT: @@ -668,18 +672,19 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ - err = hci_request(hdev, hci_auth_req, - dr.dev_opt, HCI_INIT_TIMEOUT); + err = hci_request(hdev, hci_auth_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); if (err) break; } - err = hci_request(hdev, hci_encrypt_req, - dr.dev_opt, HCI_INIT_TIMEOUT); + err = hci_request(hdev, hci_encrypt_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); break; case HCISETSCAN: - err = hci_request(hdev, hci_scan_req, dr.dev_opt, HCI_INIT_TIMEOUT); + err = hci_request(hdev, hci_scan_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); break; case HCISETPTYPE: @@ -848,6 +853,10 @@ int hci_register_dev(struct hci_dev *hdev) hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->link_mode = (HCI_LM_ACCEPT); + hdev->idle_timeout = 0; + hdev->sniff_max_interval = 800; + hdev->sniff_min_interval = 80; + tasklet_init(&hdev->cmd_task, hci_cmd_task,(unsigned long) hdev); tasklet_init(&hdev->rx_task, hci_rx_task, (unsigned long) hdev); tasklet_init(&hdev->tx_task, hci_tx_task, (unsigned long) hdev); @@ -1220,6 +1229,9 @@ static inline void hci_sched_acl(struct hci_dev *hdev) while (hdev->acl_cnt && (conn = hci_low_sent(hdev, ACL_LINK, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); + + hci_conn_enter_active_mode(conn); + hci_send_frame(skb); hdev->acl_last_tx = jiffies; @@ -1298,6 +1310,8 @@ static inline void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) if (conn) { register struct hci_proto *hp; + hci_conn_enter_active_mode(conn); + /* Send to upper protocol */ if ((hp = hci_proto[HCI_PROTO_L2CAP]) && hp->recv_acldata) { hp->recv_acldata(conn, skb, flags); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f41cf1a6c11a..3896dabab11d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -83,6 +83,8 @@ static void hci_cc_link_policy(struct hci_dev *hdev, __u16 ocf, struct sk_buff * { struct hci_conn *conn; struct hci_rp_role_discovery *rd; + struct hci_rp_write_link_policy *lp; + void *sent; BT_DBG("%s ocf 0x%x", hdev->name, ocf); @@ -106,6 +108,27 @@ static void hci_cc_link_policy(struct hci_dev *hdev, __u16 ocf, struct sk_buff * hci_dev_unlock(hdev); break; + case OCF_WRITE_LINK_POLICY: + sent = hci_sent_cmd_data(hdev, OGF_LINK_POLICY, OCF_WRITE_LINK_POLICY); + if (!sent) + break; + + lp = (struct hci_rp_write_link_policy *) skb->data; + + if (lp->status) + break; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(lp->handle)); + if (conn) { + __le16 policy = get_unaligned((__le16 *) (sent + 2)); + conn->link_policy = __le16_to_cpu(policy); + } + + hci_dev_unlock(hdev); + break; + default: BT_DBG("%s: Command complete: ogf LINK_POLICY ocf %x", hdev->name, ocf); @@ -274,7 +297,7 @@ static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, 
struct sk_buff *skb /* Command Complete OGF INFO_PARAM */ static void hci_cc_info_param(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb) { - struct hci_rp_read_loc_features *lf; + struct hci_rp_read_local_features *lf; struct hci_rp_read_buffer_size *bs; struct hci_rp_read_bd_addr *ba; @@ -282,7 +305,7 @@ static void hci_cc_info_param(struct hci_dev *hdev, __u16 ocf, struct sk_buff *s switch (ocf) { case OCF_READ_LOCAL_FEATURES: - lf = (struct hci_rp_read_loc_features *) skb->data; + lf = (struct hci_rp_read_local_features *) skb->data; if (lf->status) { BT_DBG("%s READ_LOCAL_FEATURES failed %d", hdev->name, lf->status); @@ -447,8 +470,46 @@ static void hci_cs_link_policy(struct hci_dev *hdev, __u16 ocf, __u8 status) BT_DBG("%s ocf 0x%x", hdev->name, ocf); switch (ocf) { + case OCF_SNIFF_MODE: + if (status) { + struct hci_conn *conn; + struct hci_cp_sniff_mode *cp = hci_sent_cmd_data(hdev, OGF_LINK_POLICY, OCF_SNIFF_MODE); + + if (!cp) + break; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend); + } + + hci_dev_unlock(hdev); + } + break; + + case OCF_EXIT_SNIFF_MODE: + if (status) { + struct hci_conn *conn; + struct hci_cp_exit_sniff_mode *cp = hci_sent_cmd_data(hdev, OGF_LINK_POLICY, OCF_EXIT_SNIFF_MODE); + + if (!cp) + break; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend); + } + + hci_dev_unlock(hdev); + } + break; + default: - BT_DBG("%s Command status: ogf HOST_POLICY ocf %x", hdev->name, ocf); + BT_DBG("%s Command status: ogf LINK_POLICY ocf %x", hdev->name, ocf); break; } } @@ -630,14 +691,16 @@ static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *sk else cp.role = 0x01; /* Remain slave */ - hci_send_cmd(hdev, OGF_LINK_CTL, OCF_ACCEPT_CONN_REQ, sizeof(cp), &cp); + hci_send_cmd(hdev, OGF_LINK_CTL, + OCF_ACCEPT_CONN_REQ, sizeof(cp), &cp); } else { /* Connection rejected */ struct hci_cp_reject_conn_req cp; bacpy(&cp.bdaddr, &ev->bdaddr); cp.reason = 0x0f; - hci_send_cmd(hdev, OGF_LINK_CTL, OCF_REJECT_CONN_REQ, sizeof(cp), &cp); + hci_send_cmd(hdev, OGF_LINK_CTL, + OCF_REJECT_CONN_REQ, sizeof(cp), &cp); } } @@ -645,7 +708,7 @@ static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *sk static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_conn_complete *ev = (struct hci_ev_conn_complete *) skb->data; - struct hci_conn *conn = NULL; + struct hci_conn *conn; BT_DBG("%s", hdev->name); @@ -667,12 +730,21 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (test_bit(HCI_ENCRYPT, &hdev->flags)) conn->link_mode |= HCI_LM_ENCRYPT; + /* Get remote features */ + if (conn->type == ACL_LINK) { + struct hci_cp_read_remote_features cp; + cp.handle = ev->handle; + hci_send_cmd(hdev, OGF_LINK_CTL, + OCF_READ_REMOTE_FEATURES, sizeof(cp), &cp); + } + /* Set link policy */ if (conn->type == ACL_LINK && hdev->link_policy) { struct hci_cp_write_link_policy cp; cp.handle = ev->handle; cp.policy = __cpu_to_le16(hdev->link_policy); - hci_send_cmd(hdev, OGF_LINK_POLICY, OCF_WRITE_LINK_POLICY, sizeof(cp), &cp); + hci_send_cmd(hdev, OGF_LINK_POLICY, + OCF_WRITE_LINK_POLICY, sizeof(cp), &cp); } /* Set packet type for incoming connection */ @@ -683,7 +755,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s 
__cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK): __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); - hci_send_cmd(hdev, OGF_LINK_CTL, OCF_CHANGE_CONN_PTYPE, sizeof(cp), &cp); + hci_send_cmd(hdev, OGF_LINK_CTL, + OCF_CHANGE_CONN_PTYPE, sizeof(cp), &cp); } } else conn->state = BT_CLOSED; @@ -711,8 +784,7 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_disconn_complete *ev = (struct hci_ev_disconn_complete *) skb->data; - struct hci_conn *conn = NULL; - __u16 handle = __le16_to_cpu(ev->handle); + struct hci_conn *conn; BT_DBG("%s status %d", hdev->name, ev->status); @@ -721,7 +793,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { conn->state = BT_CLOSED; hci_proto_disconn_ind(conn, ev->reason); @@ -778,7 +850,7 @@ static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *s static inline void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_role_change *ev = (struct hci_ev_role_change *) skb->data; - struct hci_conn *conn = NULL; + struct hci_conn *conn; BT_DBG("%s status %d", hdev->name, ev->status); @@ -801,18 +873,43 @@ static inline void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb hci_dev_unlock(hdev); } +/* Mode Change */ +static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_mode_change *ev = (struct hci_ev_mode_change *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn) { + conn->mode = ev->mode; + conn->interval = __le16_to_cpu(ev->interval); + + if (!test_and_clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { + if (conn->mode == HCI_CM_ACTIVE) + conn->power_save = 1; + else + conn->power_save = 0; + } + } + + hci_dev_unlock(hdev); +} + /* Authentication Complete */ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_auth_complete *ev = (struct hci_ev_auth_complete *) skb->data; - struct hci_conn *conn = NULL; - __u16 handle = __le16_to_cpu(ev->handle); + struct hci_conn *conn; BT_DBG("%s status %d", hdev->name, ev->status); hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { if (!ev->status) conn->link_mode |= HCI_LM_AUTH; @@ -827,8 +924,7 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s cp.handle = __cpu_to_le16(conn->handle); cp.encrypt = 1; hci_send_cmd(conn->hdev, OGF_LINK_CTL, - OCF_SET_CONN_ENCRYPT, - sizeof(cp), &cp); + OCF_SET_CONN_ENCRYPT, sizeof(cp), &cp); } else { clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); hci_encrypt_cfm(conn, ev->status, 0x00); @@ -843,14 +939,13 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_encrypt_change *ev = (struct hci_ev_encrypt_change *) skb->data; - struct hci_conn *conn = NULL; - __u16 handle = __le16_to_cpu(ev->handle); + struct hci_conn *conn; BT_DBG("%s status %d", hdev->name, ev->status); hci_dev_lock(hdev); - conn = 
hci_conn_hash_lookup_handle(hdev, handle); + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { if (!ev->status) { if (ev->encrypt) @@ -871,14 +966,13 @@ static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff * static inline void hci_change_conn_link_key_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_change_conn_link_key_complete *ev = (struct hci_ev_change_conn_link_key_complete *) skb->data; - struct hci_conn *conn = NULL; - __u16 handle = __le16_to_cpu(ev->handle); + struct hci_conn *conn; BT_DBG("%s status %d", hdev->name, ev->status); hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { if (!ev->status) conn->link_mode |= HCI_LM_SECURE; @@ -906,18 +1000,35 @@ static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff { } +/* Remote Features */ +static inline void hci_remote_features_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_remote_features *ev = (struct hci_ev_remote_features *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn && !ev->status) { + memcpy(conn->features, ev->features, sizeof(conn->features)); + } + + hci_dev_unlock(hdev); +} + /* Clock Offset */ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_clock_offset *ev = (struct hci_ev_clock_offset *) skb->data; - struct hci_conn *conn = NULL; - __u16 handle = __le16_to_cpu(ev->handle); + struct hci_conn *conn; BT_DBG("%s status %d", hdev->name, ev->status); hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn && !ev->status) { struct inquiry_entry *ie; @@ -948,6 +1059,23 @@ static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff * hci_dev_unlock(hdev); } +/* Sniff Subrate */ +static inline void hci_sniff_subrate_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_sniff_subrate *ev = (struct hci_ev_sniff_subrate *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn) { + } + + hci_dev_unlock(hdev); +} + void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; @@ -996,6 +1124,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_role_change_evt(hdev, skb); break; + case HCI_EV_MODE_CHANGE: + hci_mode_change_evt(hdev, skb); + break; + case HCI_EV_AUTH_COMPLETE: hci_auth_complete_evt(hdev, skb); break; @@ -1020,6 +1152,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_link_key_notify_evt(hdev, skb); break; + case HCI_EV_REMOTE_FEATURES: + hci_remote_features_evt(hdev, skb); + break; + case HCI_EV_CLOCK_OFFSET: hci_clock_offset_evt(hdev, skb); break; @@ -1028,6 +1164,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_pscan_rep_mode_evt(hdev, skb); break; + case HCI_EV_SNIFF_SUBRATE: + hci_sniff_subrate_evt(hdev, skb); + break; + case HCI_EV_CMD_STATUS: cs = (struct hci_ev_cmd_status *) skb->data; skb_pull(skb, sizeof(cs)); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 
19b234c86f33..89918d2f1fdc 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -61,18 +61,106 @@ static ssize_t show_inquiry_cache(struct class_device *cdev, char *buf) return n; } +static ssize_t show_idle_timeout(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + return sprintf(buf, "%d\n", hdev->idle_timeout); +} + +static ssize_t store_idle_timeout(struct class_device *cdev, const char *buf, size_t count) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + char *ptr; + __u32 val; + + val = simple_strtoul(buf, &ptr, 10); + if (ptr == buf) + return -EINVAL; + + if (val != 0 && (val < 500 || val > 3600000)) + return -EINVAL; + + hdev->idle_timeout = val; + + return count; +} + +static ssize_t show_sniff_max_interval(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + return sprintf(buf, "%d\n", hdev->sniff_max_interval); +} + +static ssize_t store_sniff_max_interval(struct class_device *cdev, const char *buf, size_t count) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + char *ptr; + __u16 val; + + val = simple_strtoul(buf, &ptr, 10); + if (ptr == buf) + return -EINVAL; + + if (val < 0x0002 || val > 0xFFFE || val % 2) + return -EINVAL; + + if (val < hdev->sniff_min_interval) + return -EINVAL; + + hdev->sniff_max_interval = val; + + return count; +} + +static ssize_t show_sniff_min_interval(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + return sprintf(buf, "%d\n", hdev->sniff_min_interval); +} + +static ssize_t store_sniff_min_interval(struct class_device *cdev, const char *buf, size_t count) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + char *ptr; + __u16 val; + + val = simple_strtoul(buf, &ptr, 10); + if (ptr == buf) + return -EINVAL; + + if (val < 0x0002 || val > 0xFFFE || val % 2) + return -EINVAL; + + if (val > hdev->sniff_max_interval) + return -EINVAL; + + hdev->sniff_min_interval = val; + + return count; +} + static CLASS_DEVICE_ATTR(name, S_IRUGO, show_name, NULL); static CLASS_DEVICE_ATTR(type, S_IRUGO, show_type, NULL); static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); static CLASS_DEVICE_ATTR(flags, S_IRUGO, show_flags, NULL); static CLASS_DEVICE_ATTR(inquiry_cache, S_IRUGO, show_inquiry_cache, NULL); +static CLASS_DEVICE_ATTR(idle_timeout, S_IRUGO | S_IWUSR, + show_idle_timeout, store_idle_timeout); +static CLASS_DEVICE_ATTR(sniff_max_interval, S_IRUGO | S_IWUSR, + show_sniff_max_interval, store_sniff_max_interval); +static CLASS_DEVICE_ATTR(sniff_min_interval, S_IRUGO | S_IWUSR, + show_sniff_min_interval, store_sniff_min_interval); + static struct class_device_attribute *bt_attrs[] = { &class_device_attr_name, &class_device_attr_type, &class_device_attr_address, &class_device_attr_flags, &class_device_attr_inquiry_cache, + &class_device_attr_idle_timeout, + &class_device_attr_sniff_max_interval, + &class_device_attr_sniff_min_interval, NULL }; -- cgit v1.2.3 From 27d35284258c596900e0e41c46932ec4abe6a7b1 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 3 Jul 2006 10:02:37 +0200 Subject: [Bluetooth] Add platform device for virtual and serial devices This patch adds a generic Bluetooth platform device that can be used as parent device by virtual and serial devices. 
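As a minimal, illustrative sketch of the pattern this patch introduces (the module wrapper below is hypothetical; platform_device_register_simple() and platform_device_unregister() are the actual driver-core calls the patch uses), a single shared platform device is registered once, and its embedded struct device is handed out as the sysfs parent for controllers that have no physical bus:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/err.h>

static struct platform_device *bt_parent;

static int __init bt_parent_init(void)
{
	/* One shared "bluetooth" platform device; virtual and serial
	 * controllers that never call SET_HCIDEV_DEV() fall back to
	 * &bt_parent->dev as their parent in sysfs. */
	bt_parent = platform_device_register_simple("bluetooth", -1, NULL, 0);
	return IS_ERR(bt_parent) ? PTR_ERR(bt_parent) : 0;
}

static void __exit bt_parent_exit(void)
{
	platform_device_unregister(bt_parent);
}

module_init(bt_parent_init);
module_exit(bt_parent_exit);
MODULE_LICENSE("GPL");

PCMCIA drivers, by contrast, keep pointing SET_HCIDEV_DEV() at their real bus device, as the hunks below show.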
Signed-off-by: Marcel Holtmann --- drivers/bluetooth/bluecard_cs.c | 1 + drivers/bluetooth/bt3c_cs.c | 1 + drivers/bluetooth/btuart_cs.c | 1 + drivers/bluetooth/dtl1_cs.c | 1 + drivers/bluetooth/hci_vhci.c | 1 - include/net/bluetooth/hci_core.h | 1 + net/bluetooth/af_bluetooth.c | 20 ++++++++++++++------ net/bluetooth/hci_sysfs.c | 39 +++++++++++++++++++++++++++++++++++++-- 8 files changed, 56 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c index 2830f58d6f77..8eebf9ca3786 100644 --- a/drivers/bluetooth/bluecard_cs.c +++ b/drivers/bluetooth/bluecard_cs.c @@ -739,6 +739,7 @@ static int bluecard_open(bluecard_info_t *info) hdev->type = HCI_PCCARD; hdev->driver_data = info; + SET_HCIDEV_DEV(hdev, &info->p_dev->dev); hdev->open = bluecard_hci_open; hdev->close = bluecard_hci_close; diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c index c9dba5565cac..df7bb016df49 100644 --- a/drivers/bluetooth/bt3c_cs.c +++ b/drivers/bluetooth/bt3c_cs.c @@ -582,6 +582,7 @@ static int bt3c_open(bt3c_info_t *info) hdev->type = HCI_PCCARD; hdev->driver_data = info; + SET_HCIDEV_DEV(hdev, &info->p_dev->dev); hdev->open = bt3c_hci_open; hdev->close = bt3c_hci_close; diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c index c889bf8109a1..746ccca97f6f 100644 --- a/drivers/bluetooth/btuart_cs.c +++ b/drivers/bluetooth/btuart_cs.c @@ -502,6 +502,7 @@ static int btuart_open(btuart_info_t *info) hdev->type = HCI_PCCARD; hdev->driver_data = info; + SET_HCIDEV_DEV(hdev, &info->p_dev->dev); hdev->open = btuart_hci_open; hdev->close = btuart_hci_close; diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c index be6eed175aa3..0e99def8a1e3 100644 --- a/drivers/bluetooth/dtl1_cs.c +++ b/drivers/bluetooth/dtl1_cs.c @@ -484,6 +484,7 @@ static int dtl1_open(dtl1_info_t *info) hdev->type = HCI_PCCARD; hdev->driver_data = info; + SET_HCIDEV_DEV(hdev, &info->p_dev->dev); hdev->open = dtl1_hci_open; hdev->close = dtl1_hci_close; diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index ea589007fa26..aac67a3a6019 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -277,7 +277,6 @@ static int vhci_open(struct inode *inode, struct file *file) hdev->type = HCI_VHCI; hdev->driver_data = vhci; - SET_HCIDEV_DEV(hdev, vhci_miscdev.dev); hdev->open = vhci_open_dev; hdev->close = vhci_close_dev; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f6852707bd64..ae67b36d76dc 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -124,6 +124,7 @@ struct hci_dev { atomic_t promisc; + struct device *dev; struct class_device class_dev; struct module *owner; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 729461fcfe99..788ea7a2b744 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -48,7 +48,7 @@ #define BT_DBG(D...) 
#endif -#define VERSION "2.9" +#define VERSION "2.10" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 @@ -307,13 +307,21 @@ static struct net_proto_family bt_sock_family_ops = { static int __init bt_init(void) { + int err; + BT_INFO("Core ver %s", VERSION); - sock_register(&bt_sock_family_ops); + err = bt_sysfs_init(); + if (err < 0) + return err; - BT_INFO("HCI device and connection manager initialized"); + err = sock_register(&bt_sock_family_ops); + if (err < 0) { + bt_sysfs_cleanup(); + return err; + } - bt_sysfs_init(); + BT_INFO("HCI device and connection manager initialized"); hci_sock_init(); @@ -324,9 +332,9 @@ static void __exit bt_exit(void) { hci_sock_cleanup(); - bt_sysfs_cleanup(); - sock_unregister(PF_BLUETOOTH); + + bt_sysfs_cleanup(); } subsys_initcall(bt_init); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 89918d2f1fdc..7789e26c84b9 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -3,6 +3,8 @@ #include #include +#include + #include #include @@ -197,9 +199,14 @@ struct class bt_class = { .uevent = bt_uevent, #endif }; - EXPORT_SYMBOL_GPL(bt_class); +static struct bus_type bt_bus = { + .name = "bluetooth", +}; + +static struct platform_device *bt_platform; + int hci_register_sysfs(struct hci_dev *hdev) { struct class_device *cdev = &hdev->class_dev; @@ -211,6 +218,11 @@ int hci_register_sysfs(struct hci_dev *hdev) cdev->class = &bt_class; class_set_devdata(cdev, hdev); + if (!cdev->dev) + cdev->dev = &bt_platform->dev; + + hdev->dev = cdev->dev; + strlcpy(cdev->class_id, hdev->name, BUS_ID_SIZE); err = class_device_register(cdev); if (err < 0) @@ -233,10 +245,33 @@ void hci_unregister_sysfs(struct hci_dev *hdev) int __init bt_sysfs_init(void) { - return class_register(&bt_class); + int err; + + bt_platform = platform_device_register_simple("bluetooth", -1, NULL, 0); + if (IS_ERR(bt_platform)) + return PTR_ERR(bt_platform); + + err = bus_register(&bt_bus); + if (err < 0) { + platform_device_unregister(bt_platform); + return err; + } + + err = class_register(&bt_class); + if (err < 0) { + bus_unregister(&bt_bus); + platform_device_unregister(bt_platform); + return err; + } + + return 0; } void __exit bt_sysfs_cleanup(void) { class_unregister(&bt_class); + + bus_unregister(&bt_bus); + + platform_device_unregister(bt_platform); } -- cgit v1.2.3 From a91f2e396f5b32b21d842b4757bc8de5e88eac66 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 3 Jul 2006 10:02:41 +0200 Subject: [Bluetooth] Use real devices for host controllers This patch converts the Bluetooth class devices into real devices. The Bluetooth class is kept and the driver core provides the appropriate symlinks for backward compatibility. 
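A minimal sketch of the embed-register-release pattern this conversion relies on (the example struct and names are illustrative, not from the patch; device_register(), dev_set_drvdata(), put_device() and the release callback are the real mechanics, mirroring bt_release() in the hunks below):

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/string.h>

struct example_ctrl {
	struct device dev;		/* embedded, not a pointer */
};

static void example_release(struct device *dev)
{
	/* Runs when the last reference is dropped; only now is it
	 * safe to free the containing object (cf. bt_release()). */
	kfree(dev_get_drvdata(dev));
}

static struct example_ctrl *example_register(struct device *parent)
{
	struct example_ctrl *ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);

	if (!ctrl)
		return NULL;
	ctrl->dev.parent = parent;
	ctrl->dev.release = example_release;
	dev_set_drvdata(&ctrl->dev, ctrl);
	strlcpy(ctrl->dev.bus_id, "example0", BUS_ID_SIZE);
	if (device_register(&ctrl->dev)) {
		put_device(&ctrl->dev);	/* release() frees ctrl */
		return NULL;
	}
	return ctrl;
}

Teardown is split the same way in the patch: device_del() removes the device from sysfs while references may still be held, and the final put_device() in hci_free_dev() is what ends up invoking the release function.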
Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 2 +- include/net/bluetooth/hci_core.h | 6 +- net/bluetooth/hci_core.c | 4 +- net/bluetooth/hci_sysfs.c | 152 ++++++++++++++++---------------------- net/bluetooth/l2cap.c | 4 +- net/bluetooth/rfcomm/core.c | 4 +- net/bluetooth/rfcomm/sock.c | 4 +- net/bluetooth/sco.c | 4 +- 8 files changed, 79 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 911ceb5cd263..771d17783c18 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -175,6 +175,6 @@ extern int hci_sock_cleanup(void); extern int bt_sysfs_init(void); extern void bt_sysfs_cleanup(void); -extern struct class bt_class; +extern struct class *bt_class; #endif /* __BLUETOOTH_H */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ae67b36d76dc..d84855fe7336 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -124,8 +124,8 @@ struct hci_dev { atomic_t promisc; - struct device *dev; - struct class_device class_dev; + struct device *parent; + struct device dev; struct module *owner; @@ -413,7 +413,7 @@ static inline int hci_recv_frame(struct sk_buff *skb) int hci_register_sysfs(struct hci_dev *hdev); void hci_unregister_sysfs(struct hci_dev *hdev); -#define SET_HCIDEV_DEV(hdev, pdev) ((hdev)->class_dev.dev = (pdev)) +#define SET_HCIDEV_DEV(hdev, pdev) ((hdev)->parent = (pdev)) /* ----- LMP capabilities ----- */ #define lmp_rswitch_capable(dev) ((dev)->features[0] & LMP_RSWITCH) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 3f9f1565bf06..54e8e5ea2154 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -817,8 +817,8 @@ void hci_free_dev(struct hci_dev *hdev) { skb_queue_purge(&hdev->driver_init); - /* will free via class release */ - class_device_put(&hdev->class_dev); + /* will free via device release */ + put_device(&hdev->dev); } EXPORT_SYMBOL(hci_free_dev); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 7789e26c84b9..3987d167f04e 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -13,35 +13,35 @@ #define BT_DBG(D...) 
#endif -static ssize_t show_name(struct class_device *cdev, char *buf) +static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); return sprintf(buf, "%s\n", hdev->name); } -static ssize_t show_type(struct class_device *cdev, char *buf) +static ssize_t show_type(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", hdev->type); } -static ssize_t show_address(struct class_device *cdev, char *buf) +static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); bdaddr_t bdaddr; baswap(&bdaddr, &hdev->bdaddr); return sprintf(buf, "%s\n", batostr(&bdaddr)); } -static ssize_t show_flags(struct class_device *cdev, char *buf) +static ssize_t show_flags(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); return sprintf(buf, "0x%lx\n", hdev->flags); } -static ssize_t show_inquiry_cache(struct class_device *cdev, char *buf) +static ssize_t show_inquiry_cache(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); struct inquiry_cache *cache = &hdev->inq_cache; struct inquiry_entry *e; int n = 0; @@ -63,15 +63,15 @@ static ssize_t show_inquiry_cache(struct class_device *cdev, char *buf) return n; } -static ssize_t show_idle_timeout(struct class_device *cdev, char *buf) +static ssize_t show_idle_timeout(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", hdev->idle_timeout); } -static ssize_t store_idle_timeout(struct class_device *cdev, const char *buf, size_t count) +static ssize_t store_idle_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); char *ptr; __u32 val; @@ -87,15 +87,15 @@ static ssize_t store_idle_timeout(struct class_device *cdev, const char *buf, si return count; } -static ssize_t show_sniff_max_interval(struct class_device *cdev, char *buf) +static ssize_t show_sniff_max_interval(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", hdev->sniff_max_interval); } -static ssize_t store_sniff_max_interval(struct class_device *cdev, const char *buf, size_t count) +static ssize_t store_sniff_max_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); char *ptr; __u16 val; @@ -114,15 +114,15 @@ static ssize_t store_sniff_max_interval(struct class_device *cdev, const char *b return count; } -static ssize_t show_sniff_min_interval(struct class_device *cdev, char *buf) +static ssize_t show_sniff_min_interval(struct device *dev, struct device_attribute *attr, char *buf) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); 
return sprintf(buf, "%d\n", hdev->sniff_min_interval); } -static ssize_t store_sniff_min_interval(struct class_device *cdev, const char *buf, size_t count) +static ssize_t store_sniff_min_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hci_dev *hdev = class_get_devdata(cdev); + struct hci_dev *hdev = dev_get_drvdata(dev); char *ptr; __u16 val; @@ -141,64 +141,32 @@ static ssize_t store_sniff_min_interval(struct class_device *cdev, const char *b return count; } -static CLASS_DEVICE_ATTR(name, S_IRUGO, show_name, NULL); -static CLASS_DEVICE_ATTR(type, S_IRUGO, show_type, NULL); -static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static CLASS_DEVICE_ATTR(flags, S_IRUGO, show_flags, NULL); -static CLASS_DEVICE_ATTR(inquiry_cache, S_IRUGO, show_inquiry_cache, NULL); +static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); +static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); +static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static DEVICE_ATTR(flags, S_IRUGO, show_flags, NULL); +static DEVICE_ATTR(inquiry_cache, S_IRUGO, show_inquiry_cache, NULL); -static CLASS_DEVICE_ATTR(idle_timeout, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(idle_timeout, S_IRUGO | S_IWUSR, show_idle_timeout, store_idle_timeout); -static CLASS_DEVICE_ATTR(sniff_max_interval, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(sniff_max_interval, S_IRUGO | S_IWUSR, show_sniff_max_interval, store_sniff_max_interval); -static CLASS_DEVICE_ATTR(sniff_min_interval, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(sniff_min_interval, S_IRUGO | S_IWUSR, show_sniff_min_interval, store_sniff_min_interval); -static struct class_device_attribute *bt_attrs[] = { - &class_device_attr_name, - &class_device_attr_type, - &class_device_attr_address, - &class_device_attr_flags, - &class_device_attr_inquiry_cache, - &class_device_attr_idle_timeout, - &class_device_attr_sniff_max_interval, - &class_device_attr_sniff_min_interval, +static struct device_attribute *bt_attrs[] = { + &dev_attr_name, + &dev_attr_type, + &dev_attr_address, + &dev_attr_flags, + &dev_attr_inquiry_cache, + &dev_attr_idle_timeout, + &dev_attr_sniff_max_interval, + &dev_attr_sniff_min_interval, NULL }; -#ifdef CONFIG_HOTPLUG -static int bt_uevent(struct class_device *cdev, char **envp, int num_envp, char *buf, int size) -{ - struct hci_dev *hdev = class_get_devdata(cdev); - int n, i = 0; - - envp[i++] = buf; - n = snprintf(buf, size, "INTERFACE=%s", hdev->name) + 1; - buf += n; - size -= n; - - if ((size <= 0) || (i >= num_envp)) - return -ENOMEM; - - envp[i] = NULL; - return 0; -} -#endif - -static void bt_release(struct class_device *cdev) -{ - struct hci_dev *hdev = class_get_devdata(cdev); - - kfree(hdev); -} - -struct class bt_class = { - .name = "bluetooth", - .release = bt_release, -#ifdef CONFIG_HOTPLUG - .uevent = bt_uevent, -#endif -}; +struct class *bt_class = NULL; EXPORT_SYMBOL_GPL(bt_class); static struct bus_type bt_bus = { @@ -207,40 +175,50 @@ static struct bus_type bt_bus = { static struct platform_device *bt_platform; +static void bt_release(struct device *dev) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + kfree(hdev); +} + int hci_register_sysfs(struct hci_dev *hdev) { - struct class_device *cdev = &hdev->class_dev; + struct device *dev = &hdev->dev; unsigned int i; int err; BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type); - cdev->class = &bt_class; - class_set_devdata(cdev, hdev); + dev->class = bt_class; + + if (hdev->parent) + dev->parent = hdev->parent; + else + dev->parent = 
&bt_platform->dev; + + strlcpy(dev->bus_id, hdev->name, BUS_ID_SIZE); - if (!cdev->dev) - cdev->dev = &bt_platform->dev; + dev->release = bt_release; - hdev->dev = cdev->dev; + dev_set_drvdata(dev, hdev); - strlcpy(cdev->class_id, hdev->name, BUS_ID_SIZE); - err = class_device_register(cdev); + err = device_register(dev); if (err < 0) return err; for (i = 0; bt_attrs[i]; i++) - class_device_create_file(cdev, bt_attrs[i]); + device_create_file(dev, bt_attrs[i]); return 0; } void hci_unregister_sysfs(struct hci_dev *hdev) { - struct class_device * cdev = &hdev->class_dev; + struct device *dev = &hdev->dev; BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type); - class_device_del(cdev); + device_del(dev); } int __init bt_sysfs_init(void) @@ -257,11 +235,11 @@ int __init bt_sysfs_init(void) return err; } - err = class_register(&bt_class); - if (err < 0) { + bt_class = class_create(THIS_MODULE, "bluetooth"); + if (IS_ERR(bt_class)) { bus_unregister(&bt_bus); platform_device_unregister(bt_platform); - return err; + return PTR_ERR(bt_class); } return 0; @@ -269,7 +247,7 @@ int __init bt_sysfs_init(void) void __exit bt_sysfs_cleanup(void) { - class_unregister(&bt_class); + class_destroy(bt_class); bus_unregister(&bt_bus); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 770101177da1..f6501fa0d59f 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -2219,7 +2219,7 @@ static int __init l2cap_init(void) goto error; } - class_create_file(&bt_class, &class_attr_l2cap); + class_create_file(bt_class, &class_attr_l2cap); BT_INFO("L2CAP ver %s", VERSION); BT_INFO("L2CAP socket layer initialized"); @@ -2233,7 +2233,7 @@ error: static void __exit l2cap_exit(void) { - class_remove_file(&bt_class, &class_attr_l2cap); + class_remove_file(bt_class, &class_attr_l2cap); if (bt_sock_unregister(BTPROTO_L2CAP) < 0) BT_ERR("L2CAP socket unregistration failed"); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index bd46e8927f29..6de33fe7bf5b 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -2035,7 +2035,7 @@ static int __init rfcomm_init(void) kernel_thread(rfcomm_run, NULL, CLONE_KERNEL); - class_create_file(&bt_class, &class_attr_rfcomm_dlc); + class_create_file(bt_class, &class_attr_rfcomm_dlc); rfcomm_init_sockets(); @@ -2050,7 +2050,7 @@ static int __init rfcomm_init(void) static void __exit rfcomm_exit(void) { - class_remove_file(&bt_class, &class_attr_rfcomm_dlc); + class_remove_file(bt_class, &class_attr_rfcomm_dlc); hci_unregister_cb(&rfcomm_cb); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 4e9962c8cfa6..220fee04e7f2 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -944,7 +944,7 @@ int __init rfcomm_init_sockets(void) if (err < 0) goto error; - class_create_file(&bt_class, &class_attr_rfcomm); + class_create_file(bt_class, &class_attr_rfcomm); BT_INFO("RFCOMM socket layer initialized"); @@ -958,7 +958,7 @@ error: void __exit rfcomm_cleanup_sockets(void) { - class_remove_file(&bt_class, &class_attr_rfcomm); + class_remove_file(bt_class, &class_attr_rfcomm); if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) BT_ERR("RFCOMM socket layer unregistration failed"); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index a5f1e44db5d3..85defccc0287 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -969,7 +969,7 @@ static int __init sco_init(void) goto error; } - class_create_file(&bt_class, &class_attr_sco); + class_create_file(bt_class, &class_attr_sco); 
BT_INFO("SCO (Voice Link) ver %s", VERSION); BT_INFO("SCO socket layer initialized"); @@ -983,7 +983,7 @@ error: static void __exit sco_exit(void) { - class_remove_file(&bt_class, &class_attr_sco); + class_remove_file(bt_class, &class_attr_sco); if (bt_sock_unregister(BTPROTO_SCO) < 0) BT_ERR("SCO socket unregistration failed"); -- cgit v1.2.3 From 7ad7153b051d9628ecd6a336b543ea6ef099bd2c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Jul 2006 14:00:06 -0700 Subject: Fix up headers_install wrt devfs removal No devfs_fs.h header any more.. Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index f7252be57702..2b8a7d68fae3 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -8,7 +8,7 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \ atmppp.h atmsap.h atmsvc.h atm_zatm.h auto_fs4.h auxvec.h \ awe_voice.h ax25.h b1lli.h baycom.h bfs_fs.h blkpg.h \ bpqether.h cdk.h chio.h coda_psdev.h coff.h comstats.h \ - consolemap.h cycx_cfm.h devfs_fs.h dm-ioctl.h dn.h dqblk_v1.h \ + consolemap.h cycx_cfm.h dm-ioctl.h dn.h dqblk_v1.h \ dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \ fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \ fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \ -- cgit v1.2.3