12 files changed, 363 insertions, 221 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index ccef6ae747a3..b78d4170fce2 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 1f81b79b796c..9a10a78bb4a4 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -1,10 +1,22 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
 /*
  * Zero a page. 	
  * rdi	page
  */			
-	.globl clear_page
-	.p2align 4
-clear_page:
+	ALIGN
+clear_page_c:
+	CFI_STARTPROC
+	movl $4096/8,%ecx
+	xorl %eax,%eax
+	rep stosq
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page)
+
+ENTRY(clear_page)
+	CFI_STARTPROC
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
 	.p2align 4
@@ -23,28 +35,25 @@ clear_page:
 	jnz	.Lloop
 	nop
 	ret
-clear_page_end:
+	CFI_ENDPROC
+.Lclear_page_end:
+ENDPROC(clear_page)
 
 	/* Some CPUs run faster using the string instructions.
 	   It is also a lot simpler. Use this when possible */
 
 #include <asm/cpufeature.h>
 
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
+2:
+	.previous
 	.section .altinstructions,"a"
 	.align 8
-	.quad  clear_page
-	.quad  clear_page_c
-	.byte  X86_FEATURE_REP_GOOD
-	.byte  clear_page_end-clear_page
-	.byte  clear_page_c_end-clear_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-clear_page_c:
-	movl $4096/8,%ecx
-	xorl %eax,%eax
-	rep 
-	stosq
-	ret
-clear_page_c_end:
+	.quad clear_page
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lclear_page_end - clear_page
+	.byte 2b - 1b
 	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 8fa19d96a7ee..0ebb03b60e79 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -1,17 +1,33 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
 	
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+	ALIGN
+copy_page_c:
+	CFI_STARTPROC
+	movl $4096/8,%ecx
+	rep movsq
+	ret
+	CFI_ENDPROC
+ENDPROC(copy_page_c)
+
 /* Don't use streaming store because it's better when the target
    ends up in cache. */
 	    
 /* Could vary the prefetch distance based on SMP/UP */
 
-	.globl copy_page
-	.p2align 4
-copy_page:
+ENTRY(copy_page)
+	CFI_STARTPROC
 	subq	$3*8,%rsp
+	CFI_ADJUST_CFA_OFFSET 3*8
 	movq	%rbx,(%rsp)
+	CFI_REL_OFFSET rbx, 0
 	movq	%r12,1*8(%rsp)
+	CFI_REL_OFFSET r12, 1*8
 	movq	%r13,2*8(%rsp)
+	CFI_REL_OFFSET r13, 2*8
 
 	movl	$(4096/64)-5,%ecx
 	.p2align 4
@@ -72,30 +88,33 @@ copy_page:
 	jnz	.Loop2
 
 	movq	(%rsp),%rbx
+	CFI_RESTORE rbx
 	movq	1*8(%rsp),%r12
+	CFI_RESTORE r12
 	movq	2*8(%rsp),%r13
+	CFI_RESTORE r13
 	addq	$3*8,%rsp
+	CFI_ADJUST_CFA_OFFSET -3*8
 	ret
+.Lcopy_page_end:
+	CFI_ENDPROC
+ENDPROC(copy_page)
 
 	/* Some CPUs run faster using the string copy instructions.
 	   It is also a lot simpler. Use this when possible */
 
 #include <asm/cpufeature.h>
 
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb					/* jmp <disp8> */
+	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
+2:
+	.previous
 	.section .altinstructions,"a"
 	.align 8
-	.quad  copy_page
-	.quad  copy_page_c
-	.byte  X86_FEATURE_REP_GOOD
-	.byte  copy_page_c_end-copy_page_c
-	.byte  copy_page_c_end-copy_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-copy_page_c:
-	movl $4096/8,%ecx
-	rep 
-	movsq 
-	ret
-copy_page_c_end:
+	.quad copy_page
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lcopy_page_end - copy_page
+	.byte 2b - 1b
 	.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index f64569b83b54..70bebd310408 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,56 +4,78 @@
  * Functions to copy from and to user space.		
  */		 
 
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
 #define FIX_ALIGNMENT 1
 
-	#include <asm/current.h>
-	#include <asm/asm-offsets.h>
-	#include <asm/thread_info.h>
-	#include <asm/cpufeature.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
 
-/* Standard copy_to_user with segment limit checking */		
-	.globl copy_to_user
-	.p2align 4	
-copy_to_user:
-	GET_THREAD_INFO(%rax)
-	movq %rdi,%rcx
-	addq %rdx,%rcx
-	jc  bad_to_user
-	cmpq threadinfo_addr_limit(%rax),%rcx
-	jae bad_to_user
-2:
+	.macro ALTERNATIVE_JUMP feature,orig,alt
+0:
 	.byte 0xe9	/* 32bit jump */
-	.long .Lcug-1f
+	.long \orig-1f	/* by default jump to orig */
 1:
-
 	.section .altinstr_replacement,"ax"
-3:	.byte 0xe9			/* replacement jmp with 8 bit immediate */
-	.long copy_user_generic_c-1b	/* offset */
+2:	.byte 0xe9	             /* near jump with 32bit immediate */
+	.long \alt-1b /* offset */   /* or alternatively to alt */
 	.previous
 	.section .altinstructions,"a"
 	.align 8
+	.quad  0b
 	.quad  2b
-	.quad  3b
-	.byte  X86_FEATURE_REP_GOOD
+	.byte  \feature		     /* when feature is set */
 	.byte  5
 	.byte  5
 	.previous
+	.endm
+
+/* Standard copy_to_user with segment limit checking */		
+ENTRY(copy_to_user)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%rax)
+	movq %rdi,%rcx
+	addq %rdx,%rcx
+	jc  bad_to_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae bad_to_user
+	xorl %eax,%eax	/* clear zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+
+ENTRY(copy_user_generic)
+	CFI_STARTPROC
+	movl $1,%ecx	/* set zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+
+ENTRY(__copy_from_user_inatomic)
+	CFI_STARTPROC
+	xorl %ecx,%ecx	/* clear zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
 
 /* Standard copy_from_user with segment limit checking */	
-	.globl copy_from_user
-	.p2align 4	
-copy_from_user:
+ENTRY(copy_from_user)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rsi,%rcx
 	addq %rdx,%rcx
 	jc  bad_from_user
 	cmpq threadinfo_addr_limit(%rax),%rcx
 	jae  bad_from_user
-	/* FALL THROUGH to copy_user_generic */
+	movl $1,%ecx	/* set zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+ENDPROC(copy_from_user)
 	
 	.section .fixup,"ax"
 	/* must zero dest */
 bad_from_user:
+	CFI_STARTPROC
 	movl %edx,%ecx
 	xorl %eax,%eax
 	rep
@@ -61,40 +83,32 @@ bad_from_user:
 bad_to_user:
 	movl	%edx,%eax
 	ret
+	CFI_ENDPROC
+END(bad_from_user)
 	.previous
 	
 		
 /*
- * copy_user_generic - memory copy with exception handling.
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
  * 	
  * Input:	
  * rdi destination
  * rsi source
  * rdx count
+ * ecx zero flag -- if true zero destination on error
  *
  * Output:		
  * eax uncopied bytes or 0 if successful.
  */
-	.globl copy_user_generic
-	.p2align 4
-copy_user_generic:
-	.byte 0x66,0x66,0x90	/* 5 byte nop for replacement jump */
-	.byte 0x66,0x90
-1:
-	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9	             /* near jump with 32bit immediate */
-	.long copy_user_generic_c-1b /* offset */
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad  copy_user_generic
-	.quad  2b
-	.byte  X86_FEATURE_REP_GOOD
-	.byte  5
-	.byte  5
-	.previous
-.Lcug:
+ENTRY(copy_user_generic_unrolled)
+	CFI_STARTPROC
 	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	pushq %rcx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rcx, 0
 	xorl %eax,%eax		/*zero for the exception handler */
 
 #ifdef FIX_ALIGNMENT
@@ -168,9 +182,16 @@ copy_user_generic:
 	decl %ecx
 	jnz .Lloop_1
 
+	CFI_REMEMBER_STATE
 .Lende:
+	popq %rcx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rcx
 	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
 	ret
+	CFI_RESTORE_STATE
 
 #ifdef FIX_ALIGNMENT
 	/* align destination */
@@ -252,6 +273,8 @@ copy_user_generic:
 	addl %ecx,%edx
 	/* edx: bytes to zero, rdi: dest, eax:zero */
 .Lzero_rest:
+	cmpl $0,(%rsp)
+	jz   .Le_zero
 	movq %rdx,%rcx
 .Le_byte:
 	xorl %eax,%eax
@@ -261,6 +284,9 @@ copy_user_generic:
 .Le_zero:
 	movq %rdx,%rax
 	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(copy_user_generic)
+
 
 	/* Some CPUs run faster using the string copy instructions.
 	   This is also a lot simpler. Use them when possible.
@@ -270,6 +296,7 @@ copy_user_generic:
  /* rdi	destination
   * rsi source
   * rdx count
+  * ecx zero flag
   *
   * Output:
   * eax uncopied bytes or 0 if successfull.
@@ -280,22 +307,48 @@ copy_user_generic:
   * And more would be dangerous because both Intel and AMD have
   * errata with rep movsq > 4GB. If someone feels the need to fix
   * this please consider this.
-   */
-copy_user_generic_c:
+  */
+ENTRY(copy_user_generic_string)
+	CFI_STARTPROC
+	movl %ecx,%r8d		/* save zero flag */
 	movl %edx,%ecx
 	shrl $3,%ecx
 	andl $7,%edx	
+	jz   10f
 1:	rep 
 	movsq 
 	movl %edx,%ecx
 2:	rep
 	movsb
-4:	movl %ecx,%eax
+9:	movl %ecx,%eax
 	ret
-3:	lea (%rdx,%rcx,8),%rax
+
+	/* multiple of 8 byte */
+10:	rep
+	movsq
+	xor %eax,%eax
 	ret
 
+	/* exception handling */
+3:      lea (%rdx,%rcx,8),%rax	/* exception on quad loop */
+	jmp 6f
+5:	movl %ecx,%eax		/* exception on byte loop */
+	/* eax: left over bytes */
+6:	testl %r8d,%r8d		/* zero flag set? */
+	jz 7f
+	movl %eax,%ecx		/* initialize x86 loop counter */
+	push %rax
+	xorl %eax,%eax
+8:	rep
+	stosb 			/* zero the rest */
+11:	pop %rax
+7:	ret
+	CFI_ENDPROC
+END(copy_user_generic_c)
+
 	.section __ex_table,"a"
 	.quad 1b,3b
-	.quad 2b,4b
+	.quad 2b,5b
+	.quad 8b,11b
+	.quad 10b,3b
 	.previous
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
index 72fd55ee896e..f0dba36578ea 100644
--- a/arch/x86_64/lib/csum-copy.S
+++ b/arch/x86_64/lib/csum-copy.S
@@ -5,8 +5,9 @@
  * License.  See the file COPYING in the main directory of this archive
  * for more details. No warranty for anything given at all.
  */
- 	#include <linux/linkage.h>
-	#include <asm/errno.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
 
 /*
  * Checksum copy with exception handling.
@@ -53,19 +54,24 @@
 	.endm
 	
 				
-	.globl csum_partial_copy_generic
-	.p2align 4
-csum_partial_copy_generic:
+ENTRY(csum_partial_copy_generic)
+	CFI_STARTPROC
 	cmpl	 $3*64,%edx
 	jle	 .Lignore
 
 .Lignore:		
 	subq  $7*8,%rsp
+	CFI_ADJUST_CFA_OFFSET 7*8
 	movq  %rbx,2*8(%rsp)
+	CFI_REL_OFFSET rbx, 2*8
 	movq  %r12,3*8(%rsp)
+	CFI_REL_OFFSET r12, 3*8
 	movq  %r14,4*8(%rsp)
+	CFI_REL_OFFSET r14, 4*8
 	movq  %r13,5*8(%rsp)
+	CFI_REL_OFFSET r13, 5*8
 	movq  %rbp,6*8(%rsp)
+	CFI_REL_OFFSET rbp, 6*8
 
 	movq  %r8,(%rsp)
 	movq  %r9,1*8(%rsp)
@@ -208,14 +214,22 @@ csum_partial_copy_generic:
 	addl %ebx,%eax
 	adcl %r9d,%eax		/* carry */
 			
+	CFI_REMEMBER_STATE
 .Lende:
 	movq 2*8(%rsp),%rbx
+	CFI_RESTORE rbx
 	movq 3*8(%rsp),%r12
+	CFI_RESTORE r12
 	movq 4*8(%rsp),%r14
+	CFI_RESTORE r14
 	movq 5*8(%rsp),%r13
+	CFI_RESTORE r13
 	movq 6*8(%rsp),%rbp
+	CFI_RESTORE rbp
 	addq $7*8,%rsp
+	CFI_ADJUST_CFA_OFFSET -7*8
 	ret
+	CFI_RESTORE_STATE
 
 	/* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
@@ -231,3 +245,5 @@ csum_partial_copy_generic:
 	jz   .Lende	
 	movl $-EFAULT,(%rax)
 	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
index 3844d5e885a4..5448876261f8 100644
--- a/arch/x86_64/lib/getuser.S
+++ b/arch/x86_64/lib/getuser.S
@@ -27,25 +27,26 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/dwarf2.h>
 #include <asm/page.h>
 #include <asm/errno.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 
 	.text
-	.p2align 4
-.globl __get_user_1
-__get_user_1:	
+ENTRY(__get_user_1)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	cmpq threadinfo_addr_limit(%r8),%rcx
 	jae bad_get_user
 1:	movzb (%rcx),%edx
 	xorl %eax,%eax
 	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_1)
 
-	.p2align 4
-.globl __get_user_2
-__get_user_2:
+ENTRY(__get_user_2)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	addq $1,%rcx
 	jc 20f
@@ -57,10 +58,11 @@ __get_user_2:
 	ret
 20:	decq    %rcx
 	jmp	bad_get_user
+	CFI_ENDPROC
+ENDPROC(__get_user_2)
 
-	.p2align 4
-.globl __get_user_4
-__get_user_4:
+ENTRY(__get_user_4)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	addq $3,%rcx
 	jc 30f
@@ -72,10 +74,11 @@ __get_user_4:
 	ret
 30:	subq $3,%rcx
 	jmp bad_get_user
+	CFI_ENDPROC
+ENDPROC(__get_user_4)
 
-	.p2align 4
-.globl __get_user_8
-__get_user_8:
+ENTRY(__get_user_8)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	addq $7,%rcx
 	jc 40f
@@ -87,11 +90,16 @@ __get_user_8:
 	ret
 40:	subq $7,%rcx
 	jmp bad_get_user
+	CFI_ENDPROC
+ENDPROC(__get_user_8)
 
 bad_get_user:
+	CFI_STARTPROC
 	xorl %edx,%edx
 	movq $(-EFAULT),%rax
 	ret
+	CFI_ENDPROC
+END(bad_get_user)
 
 .section __ex_table,"a"
 	.quad 1b,bad_get_user
diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S
index 8bbade5fea05..05a95e713da8 100644
--- a/arch/x86_64/lib/iomap_copy.S
+++ b/arch/x86_64/lib/iomap_copy.S
@@ -15,12 +15,16 @@
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
 /*
  * override generic version in lib/iomap_copy.c
  */
- 	.globl __iowrite32_copy
-	.p2align 4
-__iowrite32_copy:
+ENTRY(__iowrite32_copy)
+	CFI_STARTPROC
 	movl %edx,%ecx
 	rep movsd
 	ret
+	CFI_ENDPROC
+ENDPROC(__iowrite32_copy)
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 5554948b5554..967b22fa7d07 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -1,6 +1,10 @@
 /* Copyright 2002 Andi Kleen */
 	
-	#include <asm/cpufeature.h>		
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
 /*
  * memcpy - Copy a memory block.
  *
@@ -13,12 +17,26 @@
  * rax original destination
  */	
 
- 	.globl __memcpy
-	.globl memcpy
-	.p2align 4
-__memcpy:
-memcpy:		
+	ALIGN
+memcpy_c:
+	CFI_STARTPROC
+	movq %rdi,%rax
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx
+	rep movsq
+	movl %edx,%ecx
+	rep movsb
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	CFI_STARTPROC
 	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
 	movq %rdi,%rax
 
 	movl %edx,%ecx
@@ -86,36 +104,27 @@ memcpy:
 
 .Lende:
 	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
 	ret
 .Lfinal:
+	CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
 
 	/* Some CPUs run faster using the string copy instructions.
 	   It is also a lot simpler. Use this when possible */
 
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
+2:
+	.previous
 	.section .altinstructions,"a"
 	.align 8
-	.quad  memcpy
-	.quad  memcpy_c
-	.byte  X86_FEATURE_REP_GOOD
-	.byte  .Lfinal-memcpy
-	.byte  memcpy_c_end-memcpy_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi source
-  * rdx count
-  */
-memcpy_c:
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx	
-	rep 
-	movsq 
-	movl %edx,%ecx
-	rep
-	movsb
-	ret
-memcpy_c_end:
+	.quad memcpy
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lfinal - memcpy
+	.byte 2b - 1b
 	.previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index ad397f2c7de8..09ed1f6b0eaa 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,4 +1,9 @@
 /* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
 /*
  * ISO C memset - set a memory block to a byte value.
  *	
@@ -8,11 +13,29 @@
  * 
  * rax   original destination
  */	
- 	.globl __memset
-	.globl memset
-	.p2align 4
-memset:	
-__memset:
+	ALIGN
+memset_c:
+	CFI_STARTPROC
+	movq %rdi,%r9
+	movl %edx,%r8d
+	andl $7,%r8d
+	movl %edx,%ecx
+	shrl $3,%ecx
+	/* expand byte value  */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	mulq %rsi		/* with rax, clobbers rdx */
+	rep stosq
+	movl %r8d,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+	CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+	CFI_STARTPROC
 	movq %rdi,%r10
 	movq %rdx,%r11
 
@@ -25,6 +48,7 @@ __memset:
 	movl  %edi,%r9d
 	andl  $7,%r9d
 	jnz  .Lbad_alignment
+	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
 	movl %r11d,%ecx
@@ -75,6 +99,7 @@ __memset:
 	movq	%r10,%rax
 	ret
 
+	CFI_RESTORE_STATE
 .Lbad_alignment:
 	cmpq $7,%r11
 	jbe	.Lhandle_7
@@ -84,42 +109,26 @@ __memset:
 	addq %r8,%rdi
 	subq %r8,%r11
 	jmp .Lafter_bad_alignment
+.Lfinal:
+	CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
 
 	/* Some CPUs run faster using the string instructions.
 	   It is also a lot simpler. Use this when possible */
 
 #include <asm/cpufeature.h>
 
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memset_c - memset) - (2f - 1b)	/* offset */
+2:
+	.previous
 	.section .altinstructions,"a"
 	.align 8
-	.quad  memset
-	.quad  memset_c
-	.byte  X86_FEATURE_REP_GOOD
-	.byte  memset_c_end-memset_c
-	.byte  memset_c_end-memset_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi value
-  * rdx count
-  */
-memset_c:
-	movq %rdi,%r9
-	movl %edx,%r8d
-	andl $7,%r8d		
-	movl %edx,%ecx
-	shrl $3,%ecx		
-	/* expand byte value  */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	mulq   %rsi		/* with rax, clobbers rdx */
-	rep
-	stosq	
-	movl %r8d,%ecx
-	rep
-	stosb
-	movq %r9,%rax
-	ret
-memset_c_end:
+	.quad memset
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lfinal - memset
+	.byte 2b - 1b
 	.previous
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
index 7f5593974e2d..4989f5a8fa9b 100644
--- a/arch/x86_64/lib/putuser.S
+++ b/arch/x86_64/lib/putuser.S
@@ -25,25 +25,26 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/dwarf2.h>
 #include <asm/page.h>
 #include <asm/errno.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 
 	.text
-	.p2align 4
-.globl __put_user_1
-__put_user_1:
+ENTRY(__put_user_1)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	cmpq threadinfo_addr_limit(%r8),%rcx
 	jae bad_put_user
 1:	movb %dl,(%rcx)
 	xorl %eax,%eax
 	ret
+	CFI_ENDPROC
+ENDPROC(__put_user_1)
 
-	.p2align 4
-.globl __put_user_2
-__put_user_2:
+ENTRY(__put_user_2)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	addq $1,%rcx
 	jc 20f
@@ -55,10 +56,11 @@ __put_user_2:
 	ret
 20:	decq %rcx
 	jmp bad_put_user
+	CFI_ENDPROC
+ENDPROC(__put_user_2)
 
-	.p2align 4
-.globl __put_user_4
-__put_user_4:
+ENTRY(__put_user_4)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	addq $3,%rcx
 	jc 30f
@@ -70,10 +72,11 @@ __put_user_4:
 	ret
 30:	subq $3,%rcx
 	jmp bad_put_user
+	CFI_ENDPROC
+ENDPROC(__put_user_4)
 
-	.p2align 4
-.globl __put_user_8
-__put_user_8:
+ENTRY(__put_user_8)
+	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
 	addq $7,%rcx
 	jc 40f
@@ -85,10 +88,15 @@ __put_user_8:
 	ret
 40:	subq $7,%rcx
 	jmp bad_put_user
+	CFI_ENDPROC
+ENDPROC(__put_user_8)
 
 bad_put_user:
+	CFI_STARTPROC
 	movq $(-EFAULT),%rax
 	ret
+	CFI_ENDPROC
+END(bad_put_user)
 
 .section __ex_table,"a"
 	.quad 1b,bad_put_user
diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86_64/lib/rwlock.S
@@ -0,0 +1,38 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/dwarf2.h>
+
+/* rdi:	pointer to rwlock_t */
+ENTRY(__write_lock_failed)
+	CFI_STARTPROC
+	LOCK_PREFIX
+	addl $RW_LOCK_BIAS,(%rdi)
+1:	rep
+	nop
+	cmpl $RW_LOCK_BIAS,(%rdi)
+	jne 1b
+	LOCK_PREFIX
+	subl $RW_LOCK_BIAS,(%rdi)
+	jnz  __write_lock_failed
+	ret
+	CFI_ENDPROC
+END(__write_lock_failed)
+
+/* rdi:	pointer to rwlock_t */
+ENTRY(__read_lock_failed)
+	CFI_STARTPROC
+	LOCK_PREFIX
+	incl (%rdi)
+1:	rep
+	nop
+	cmpl $1,(%rdi)
+	js 1b
+	LOCK_PREFIX
+	decl (%rdi)
+	js __read_lock_failed
+	ret
+	CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
index 332ea5dff916..0025535cac8d 100644
--- a/arch/x86_64/lib/thunk.S
+++ b/arch/x86_64/lib/thunk.S
@@ -1,10 +1,9 @@
-	/*
-	 * Save registers before calling assembly functions. This avoids
-	 * disturbance of register allocation in some inline assembly constructs.
-	 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
-	 * Subject to the GNU public license, v.2. No warranty of any kind.
-	 * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $
-	 */
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
 
 	#include <linux/config.h>
 	#include <linux/linkage.h>
@@ -67,33 +66,3 @@ restore_norax:
 	RESTORE_ARGS 1
 	ret
 	CFI_ENDPROC
-
-#ifdef CONFIG_SMP
-/* Support for read/write spinlocks. */
-	.text
-/* rax:	pointer to rwlock_t */	
-ENTRY(__write_lock_failed)
-	lock
-	addl $RW_LOCK_BIAS,(%rax)
-1:	rep
-	nop
-	cmpl $RW_LOCK_BIAS,(%rax)
-	jne 1b
-	lock 
-	subl $RW_LOCK_BIAS,(%rax)
-	jnz  __write_lock_failed
-	ret
-
-/* rax:	pointer to rwlock_t */	
-ENTRY(__read_lock_failed)
-	lock
-	incl (%rax)
-1:	rep
-	nop
-	cmpl $1,(%rax)
-	js 1b
-	lock
-	decl (%rax)
-	js __read_lock_failed
-	ret
-#endif