M7350v1_en_gpl

This commit is contained in:
T
2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
+47
View File
@@ -0,0 +1,47 @@
#
# Makefile for x86 specific library files.
#
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
$(call cmd,inat_tables)
$(obj)/inat.o: $(obj)/inat-tables.c
clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_SMP) += rwlock.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
obj-y += msr.o msr-reg.o msr-reg-export.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += string_32.o
lib-y += cmpxchg.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
endif
+4
View File
@@ -0,0 +1,4 @@
#define ATOMIC64_EXPORT EXPORT_SYMBOL
#include <linux/export.h>
#include <linux/atomic.h>
+194
View File
@@ -0,0 +1,194 @@
/*
* atomic64_t for 386/486
*
* Copyright © 2010 Luca Barbieri
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
#include <asm/dwarf2.h>
/* if you want SMP support, implement these with real spinlocks */
.macro LOCK reg
pushfl_cfi
cli
.endm
.macro UNLOCK reg
popfl_cfi
.endm
#define BEGIN(op) \
.macro endp; \
CFI_ENDPROC; \
ENDPROC(atomic64_##op##_386); \
.purgem endp; \
.endm; \
ENTRY(atomic64_##op##_386); \
CFI_STARTPROC; \
LOCK v;
#define ENDP endp
#define RET \
UNLOCK v; \
ret
#define RET_ENDP \
RET; \
ENDP
#define v %ecx
BEGIN(read)
movl (v), %eax
movl 4(v), %edx
RET_ENDP
#undef v
#define v %esi
BEGIN(set)
movl %ebx, (v)
movl %ecx, 4(v)
RET_ENDP
#undef v
#define v %esi
BEGIN(xchg)
movl (v), %eax
movl 4(v), %edx
movl %ebx, (v)
movl %ecx, 4(v)
RET_ENDP
#undef v
#define v %ecx
BEGIN(add)
addl %eax, (v)
adcl %edx, 4(v)
RET_ENDP
#undef v
#define v %ecx
BEGIN(add_return)
addl (v), %eax
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
RET_ENDP
#undef v
#define v %ecx
BEGIN(sub)
subl %eax, (v)
sbbl %edx, 4(v)
RET_ENDP
#undef v
#define v %ecx
BEGIN(sub_return)
negl %edx
negl %eax
sbbl $0, %edx
addl (v), %eax
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
RET_ENDP
#undef v
#define v %esi
BEGIN(inc)
addl $1, (v)
adcl $0, 4(v)
RET_ENDP
#undef v
#define v %esi
BEGIN(inc_return)
movl (v), %eax
movl 4(v), %edx
addl $1, %eax
adcl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
RET_ENDP
#undef v
#define v %esi
BEGIN(dec)
subl $1, (v)
sbbl $0, 4(v)
RET_ENDP
#undef v
#define v %esi
BEGIN(dec_return)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
RET_ENDP
#undef v
#define v %esi
BEGIN(add_unless)
addl %eax, %ecx
adcl %edx, %edi
addl (v), %eax
adcl 4(v), %edx
cmpl %eax, %ecx
je 3f
1:
movl %eax, (v)
movl %edx, 4(v)
movl $1, %eax
2:
RET
3:
cmpl %edx, %edi
jne 1b
xorl %eax, %eax
jmp 2b
ENDP
#undef v
#define v %esi
BEGIN(inc_not_zero)
movl (v), %eax
movl 4(v), %edx
testl %eax, %eax
je 3f
1:
addl $1, %eax
adcl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
movl $1, %eax
2:
RET
3:
testl %edx, %edx
jne 1b
jmp 2b
ENDP
#undef v
#define v %esi
BEGIN(dec_if_positive)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
js 1f
movl %eax, (v)
movl %edx, 4(v)
1:
RET_ENDP
#undef v
+215
View File
@@ -0,0 +1,215 @@
/*
* atomic64_t for 586+
*
* Copyright © 2010 Luca Barbieri
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
#include <asm/dwarf2.h>
.macro SAVE reg
pushl_cfi %\reg
CFI_REL_OFFSET \reg, 0
.endm
.macro RESTORE reg
popl_cfi %\reg
CFI_RESTORE \reg
.endm
.macro read64 reg
movl %ebx, %eax
movl %ecx, %edx
/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
LOCK_PREFIX
cmpxchg8b (\reg)
.endm
ENTRY(atomic64_read_cx8)
CFI_STARTPROC
read64 %ecx
ret
CFI_ENDPROC
ENDPROC(atomic64_read_cx8)
ENTRY(atomic64_set_cx8)
CFI_STARTPROC
1:
/* we don't need LOCK_PREFIX since aligned 64-bit writes
* are atomic on 586 and newer */
cmpxchg8b (%esi)
jne 1b
ret
CFI_ENDPROC
ENDPROC(atomic64_set_cx8)
ENTRY(atomic64_xchg_cx8)
CFI_STARTPROC
1:
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
ret
CFI_ENDPROC
ENDPROC(atomic64_xchg_cx8)
.macro addsub_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
CFI_STARTPROC
SAVE ebp
SAVE ebx
SAVE esi
SAVE edi
movl %eax, %esi
movl %edx, %edi
movl %ecx, %ebp
read64 %ecx
1:
movl %eax, %ebx
movl %edx, %ecx
\ins\()l %esi, %ebx
\insc\()l %edi, %ecx
LOCK_PREFIX
cmpxchg8b (%ebp)
jne 1b
10:
movl %ebx, %eax
movl %ecx, %edx
RESTORE edi
RESTORE esi
RESTORE ebx
RESTORE ebp
ret
CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
.endm
addsub_return add add adc
addsub_return sub sub sbb
.macro incdec_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
CFI_STARTPROC
SAVE ebx
read64 %esi
1:
movl %eax, %ebx
movl %edx, %ecx
\ins\()l $1, %ebx
\insc\()l $0, %ecx
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
10:
movl %ebx, %eax
movl %ecx, %edx
RESTORE ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
.endm
incdec_return inc add adc
incdec_return dec sub sbb
ENTRY(atomic64_dec_if_positive_cx8)
CFI_STARTPROC
SAVE ebx
read64 %esi
1:
movl %eax, %ebx
movl %edx, %ecx
subl $1, %ebx
sbb $0, %ecx
js 2f
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
2:
movl %ebx, %eax
movl %ecx, %edx
RESTORE ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_dec_if_positive_cx8)
ENTRY(atomic64_add_unless_cx8)
CFI_STARTPROC
SAVE ebp
SAVE ebx
/* these just push these two parameters on the stack */
SAVE edi
SAVE ecx
movl %eax, %ebp
movl %edx, %edi
read64 %esi
1:
cmpl %eax, 0(%esp)
je 4f
2:
movl %eax, %ebx
movl %edx, %ecx
addl %ebp, %ebx
adcl %edi, %ecx
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
movl $1, %eax
3:
addl $8, %esp
CFI_ADJUST_CFA_OFFSET -8
RESTORE ebx
RESTORE ebp
ret
4:
cmpl %edx, 4(%esp)
jne 2b
xorl %eax, %eax
jmp 3b
CFI_ENDPROC
ENDPROC(atomic64_add_unless_cx8)
ENTRY(atomic64_inc_not_zero_cx8)
CFI_STARTPROC
SAVE ebx
read64 %esi
1:
movl %eax, %ecx
orl %edx, %ecx
jz 3f
movl %eax, %ebx
xorl %ecx, %ecx
addl $1, %ebx
adcl %edx, %ecx
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
movl $1, %eax
3:
RESTORE ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_inc_not_zero_cx8)
+19
View File
@@ -0,0 +1,19 @@
#include <linux/smp.h>
#include <linux/module.h>
static void __wbinvd(void *dummy)
{
wbinvd();
}
void wbinvd_on_cpu(int cpu)
{
smp_call_function_single(cpu, __wbinvd, NULL, 1);
}
EXPORT_SYMBOL(wbinvd_on_cpu);
int wbinvd_on_all_cpus(void)
{
return on_each_cpu(__wbinvd, NULL, 1);
}
EXPORT_SYMBOL(wbinvd_on_all_cpus);
+525
View File
@@ -0,0 +1,525 @@
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IP/TCP/UDP checksumming routines
*
* Authors: Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Tom May, <ftom@netcom.com>
* Pentium Pro/II routines:
* Alexander Kjeldaas <astor@guardian.no>
* Finn Arne Gangstad <finnag@guardian.no>
* Lots of code moved from tcp.c and ip.c; see those files
* for more names.
*
* Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
* handling.
* Andi Kleen, add zeroing on error
* converted to pure assembler
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/errno.h>
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
*/
/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
.text
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
/*
* Experiments with Ethernet and SLIP connections show that buff
* is aligned on either a 2-byte or 4-byte boundary. We get at
* least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
* Fortunately, it is easy to convert 2-byte alignment to 4-byte
* alignment for the unrolled loop.
*/
ENTRY(csum_partial)
CFI_STARTPROC
pushl_cfi %esi
CFI_REL_OFFSET esi, 0
pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
testl $3, %esi # Check alignment.
jz 2f # Jump if alignment is ok.
testl $1, %esi # Check alignment.
jz 10f # Jump if alignment is boundary of 2bytes.
# buf is odd
dec %ecx
jl 8f
movzbl (%esi), %ebx
adcl %ebx, %eax
roll $8, %eax
inc %esi
testl $2, %esi
jz 2f
10:
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it.
jmp 4f
1: movw (%esi), %bx
addl $2, %esi
addw %bx, %ax
adcl $0, %eax
2:
movl %ecx, %edx
shrl $5, %ecx
jz 2f
testl %esi, %esi
1: movl (%esi), %ebx
adcl %ebx, %eax
movl 4(%esi), %ebx
adcl %ebx, %eax
movl 8(%esi), %ebx
adcl %ebx, %eax
movl 12(%esi), %ebx
adcl %ebx, %eax
movl 16(%esi), %ebx
adcl %ebx, %eax
movl 20(%esi), %ebx
adcl %ebx, %eax
movl 24(%esi), %ebx
adcl %ebx, %eax
movl 28(%esi), %ebx
adcl %ebx, %eax
lea 32(%esi), %esi
dec %ecx
jne 1b
adcl $0, %eax
2: movl %edx, %ecx
andl $0x1c, %edx
je 4f
shrl $2, %edx # This clears CF
3: adcl (%esi), %eax
lea 4(%esi), %esi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx
jz 7f
cmpl $2, %ecx
jb 5f
movw (%esi),%cx
leal 2(%esi),%esi
je 6f
shll $16,%ecx
5: movb (%esi),%cl
6: addl %ecx,%eax
adcl $0, %eax
7:
testl $1, 12(%esp)
jz 8f
roll $8, %eax
8:
popl_cfi %ebx
CFI_RESTORE ebx
popl_cfi %esi
CFI_RESTORE esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
#else
/* Version for PentiumII/PPro */
ENTRY(csum_partial)
CFI_STARTPROC
pushl_cfi %esi
CFI_REL_OFFSET esi, 0
pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
testl $3, %esi
jnz 25f
10:
movl %ecx, %edx
movl %ecx, %ebx
andl $0x7c, %ebx
shrl $7, %ecx
addl %ebx,%esi
shrl $2, %ebx
negl %ebx
lea 45f(%ebx,%ebx,2), %ebx
testl %esi, %esi
jmp *%ebx
# Handle 2-byte-aligned regions
20: addw (%esi), %ax
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b
25:
testl $1, %esi
jz 30f
# buf is odd
dec %ecx
jl 90f
movzbl (%esi), %ebx
addl %ebx, %eax
adcl $0, %eax
roll $8, %eax
inc %esi
testl $2, %esi
jz 10b
30: subl $2, %ecx
ja 20b
je 32f
addl $2, %ecx
jz 80f
movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
addl %ebx, %eax
adcl $0, %eax
jmp 80f
32:
addw (%esi), %ax # csumming 2 bytes, 2-aligned
adcl $0, %eax
jmp 80f
40:
addl -128(%esi), %eax
adcl -124(%esi), %eax
adcl -120(%esi), %eax
adcl -116(%esi), %eax
adcl -112(%esi), %eax
adcl -108(%esi), %eax
adcl -104(%esi), %eax
adcl -100(%esi), %eax
adcl -96(%esi), %eax
adcl -92(%esi), %eax
adcl -88(%esi), %eax
adcl -84(%esi), %eax
adcl -80(%esi), %eax
adcl -76(%esi), %eax
adcl -72(%esi), %eax
adcl -68(%esi), %eax
adcl -64(%esi), %eax
adcl -60(%esi), %eax
adcl -56(%esi), %eax
adcl -52(%esi), %eax
adcl -48(%esi), %eax
adcl -44(%esi), %eax
adcl -40(%esi), %eax
adcl -36(%esi), %eax
adcl -32(%esi), %eax
adcl -28(%esi), %eax
adcl -24(%esi), %eax
adcl -20(%esi), %eax
adcl -16(%esi), %eax
adcl -12(%esi), %eax
adcl -8(%esi), %eax
adcl -4(%esi), %eax
45:
lea 128(%esi), %esi
adcl $0, %eax
dec %ecx
jge 40b
movl %edx, %ecx
50: andl $3, %ecx
jz 80f
# Handle the last 1-3 bytes without jumping
notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
movl $0xffffff,%ebx # by the shll and shrl instructions
shll $3,%ecx
shrl %cl,%ebx
andl -128(%esi),%ebx # esi is 4-aligned so should be ok
addl %ebx,%eax
adcl $0,%eax
80:
testl $1, 12(%esp)
jz 90f
roll $8, %eax
90:
popl_cfi %ebx
CFI_RESTORE ebx
popl_cfi %esi
CFI_RESTORE esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
#endif
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
int len, int sum, int *src_err_ptr, int *dst_err_ptr)
*/
/*
* Copy from ds while checksumming, otherwise like csum_partial
*
* The macros SRC and DST specify the type of access for the instruction.
* thus we can call a custom exception handler for all access types.
*
* FIXME: could someone double-check whether I haven't mixed up some SRC and
* DST definitions? It's damn hard to trigger all cases. I hope I got
* them all but there's no guarantee.
*/
#define SRC(y...) \
9999: y; \
.section __ex_table, "a"; \
.long 9999b, 6001f ; \
.previous
#define DST(y...) \
9999: y; \
.section __ex_table, "a"; \
.long 9999b, 6002f ; \
.previous
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
#define ARGBASE 16
#define FP 12
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
subl $4,%esp
CFI_ADJUST_CFA_OFFSET 4
pushl_cfi %edi
CFI_REL_OFFSET edi, 0
pushl_cfi %esi
CFI_REL_OFFSET esi, 0
pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
movl ARGBASE+8(%esp),%edi # dst
testl $2, %edi # Check alignment.
jz 2f # Jump if alignment is ok.
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it.
jmp 4f
SRC(1: movw (%esi), %bx )
addl $2, %esi
DST( movw %bx, (%edi) )
addl $2, %edi
addw %bx, %ax
adcl $0, %eax
2:
movl %ecx, FP(%esp)
shrl $5, %ecx
jz 2f
testl %esi, %esi
SRC(1: movl (%esi), %ebx )
SRC( movl 4(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
adcl %edx, %eax
DST( movl %edx, 4(%edi) )
SRC( movl 8(%esi), %ebx )
SRC( movl 12(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 8(%edi) )
adcl %edx, %eax
DST( movl %edx, 12(%edi) )
SRC( movl 16(%esi), %ebx )
SRC( movl 20(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 16(%edi) )
adcl %edx, %eax
DST( movl %edx, 20(%edi) )
SRC( movl 24(%esi), %ebx )
SRC( movl 28(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 24(%edi) )
adcl %edx, %eax
DST( movl %edx, 28(%edi) )
lea 32(%esi), %esi
lea 32(%edi), %edi
dec %ecx
jne 1b
adcl $0, %eax
2: movl FP(%esp), %edx
movl %edx, %ecx
andl $0x1c, %edx
je 4f
shrl $2, %edx # This clears CF
SRC(3: movl (%esi), %ebx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
lea 4(%esi), %esi
lea 4(%edi), %edi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx
jz 7f
cmpl $2, %ecx
jb 5f
SRC( movw (%esi), %cx )
leal 2(%esi), %esi
DST( movw %cx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%ecx
SRC(5: movb (%esi), %cl )
DST( movb %cl, (%edi) )
6: addl %ecx, %eax
adcl $0, %eax
7:
5000:
# Exception handler:
.section .fixup, "ax"
6001:
movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx)
# zero the complete destination - computing the rest
# is too much work
movl ARGBASE+8(%esp), %edi # dst
movl ARGBASE+12(%esp), %ecx # len
xorl %eax,%eax
rep ; stosb
jmp 5000b
6002:
movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT,(%ebx)
jmp 5000b
.previous
popl_cfi %ebx
CFI_RESTORE ebx
popl_cfi %esi
CFI_RESTORE esi
popl_cfi %edi
CFI_RESTORE edi
popl_cfi %ecx # equivalent to addl $4,%esp
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
#else
/* Version for PentiumII/PPro */
#define ROUND1(x) \
SRC(movl x(%esi), %ebx ) ; \
addl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;
#define ROUND(x) \
SRC(movl x(%esi), %ebx ) ; \
adcl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;
#define ARGBASE 12
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
pushl_cfi %edi
CFI_REL_OFFSET edi, 0
pushl_cfi %esi
CFI_REL_OFFSET esi, 0
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
movl ARGBASE+16(%esp),%eax #sum
# movl %ecx, %edx
movl %ecx, %ebx
movl %esi, %edx
shrl $6, %ecx
andl $0x3c, %ebx
negl %ebx
subl %ebx, %esi
subl %ebx, %edi
lea -1(%esi),%edx
andl $-32,%edx
lea 3f(%ebx,%ebx), %ebx
testl %esi, %esi
jmp *%ebx
1: addl $64,%esi
addl $64,%edi
SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
3: adcl $0,%eax
addl $64, %edx
dec %ecx
jge 1b
4: movl ARGBASE+12(%esp),%edx #len
andl $3, %edx
jz 7f
cmpl $2, %edx
jb 5f
SRC( movw (%esi), %dx )
leal 2(%esi), %esi
DST( movw %dx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%edx
5:
SRC( movb (%esi), %dl )
DST( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
.section .fixup, "ax"
6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx)
# zero the complete destination (computing the rest is too much work)
movl ARGBASE+8(%esp),%edi # dst
movl ARGBASE+12(%esp),%ecx # len
xorl %eax,%eax
rep; stosb
jmp 7b
6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT, (%ebx)
jmp 7b
.previous
popl_cfi %esi
CFI_RESTORE esi
popl_cfi %edi
CFI_RESTORE edi
popl_cfi %ebx
CFI_RESTORE ebx
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
#undef ROUND
#undef ROUND1
#endif
+73
View File
@@ -0,0 +1,73 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/*
* Zero a page.
* rdi page
*/
ENTRY(clear_page_c)
CFI_STARTPROC
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
CFI_ENDPROC
ENDPROC(clear_page_c)
ENTRY(clear_page_c_e)
CFI_STARTPROC
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
CFI_ENDPROC
ENDPROC(clear_page_c_e)
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
PUT(2)
PUT(3)
PUT(4)
PUT(5)
PUT(6)
PUT(7)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
ret
CFI_ENDPROC
.Lclear_page_end:
ENDPROC(clear_page)
/*
* Some CPUs support enhanced REP MOVSB/STOSB instructions.
* It is recommended to use this when possible.
* If enhanced REP MOVSB/STOSB is not available, try to use fast string.
* Otherwise, use original function.
*
*/
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
2: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
3:
.previous
.section .altinstructions,"a"
altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
.Lclear_page_end-clear_page, 2b-1b
altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
.Lclear_page_end-clear_page,3b-2b
.previous
+54
View File
@@ -0,0 +1,54 @@
/*
* cmpxchg*() fallbacks for CPU not supporting these instructions
*/
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/module.h>
#ifndef CONFIG_X86_CMPXCHG
unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
{
u8 prev;
unsigned long flags;
/* Poor man's cmpxchg for 386. Unsuitable for SMP */
local_irq_save(flags);
prev = *(u8 *)ptr;
if (prev == old)
*(u8 *)ptr = new;
local_irq_restore(flags);
return prev;
}
EXPORT_SYMBOL(cmpxchg_386_u8);
unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
{
u16 prev;
unsigned long flags;
/* Poor man's cmpxchg for 386. Unsuitable for SMP */
local_irq_save(flags);
prev = *(u16 *)ptr;
if (prev == old)
*(u16 *)ptr = new;
local_irq_restore(flags);
return prev;
}
EXPORT_SYMBOL(cmpxchg_386_u16);
unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
{
u32 prev;
unsigned long flags;
/* Poor man's cmpxchg for 386. Unsuitable for SMP */
local_irq_save(flags);
prev = *(u32 *)ptr;
if (prev == old)
*(u32 *)ptr = new;
local_irq_restore(flags);
return prev;
}
EXPORT_SYMBOL(cmpxchg_386_u32);
#endif
+65
View File
@@ -0,0 +1,65 @@
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
*/
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
#include <asm/frame.h>
#include <asm/dwarf2.h>
#ifdef CONFIG_SMP
#define SEG_PREFIX %gs:
#else
#define SEG_PREFIX
#endif
.text
/*
* Inputs:
* %rsi : memory location to compare
* %rax : low 64 bits of old value
* %rdx : high 64 bits of old value
* %rbx : low 64 bits of new value
* %rcx : high 64 bits of new value
* %al : Operation successful
*/
ENTRY(this_cpu_cmpxchg16b_emu)
CFI_STARTPROC
#
# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
# via the ZF. Caller will access %al to get result.
#
# Note that this is only useful for a cpuops operation. Meaning that we
# do *not* have a fully atomic operation but just an operation that is
# *atomic* on a single cpu (as provided by the this_cpu_xx class of
# macros).
#
this_cpu_cmpxchg16b_emu:
pushf
cli
cmpq SEG_PREFIX(%rsi), %rax
jne not_same
cmpq SEG_PREFIX 8(%rsi), %rdx
jne not_same
movq %rbx, SEG_PREFIX(%rsi)
movq %rcx, SEG_PREFIX 8(%rsi)
popf
mov $1, %al
ret
not_same:
popf
xor %al,%al
ret
CFI_ENDPROC
ENDPROC(this_cpu_cmpxchg16b_emu)
+57
View File
@@ -0,0 +1,57 @@
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
*/
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
#include <asm/frame.h>
#include <asm/dwarf2.h>
.text
/*
* Inputs:
* %esi : memory location to compare
* %eax : low 32 bits of old value
* %edx : high 32 bits of old value
* %ebx : low 32 bits of new value
* %ecx : high 32 bits of new value
*/
ENTRY(cmpxchg8b_emu)
CFI_STARTPROC
#
# Emulate 'cmpxchg8b (%esi)' on UP except we don't
# set the whole ZF thing (caller will just compare
# eax:edx with the expected value)
#
cmpxchg8b_emu:
pushfl
cli
cmpl (%esi), %eax
jne not_same
cmpl 4(%esi), %edx
jne half_same
movl %ebx, (%esi)
movl %ecx, 4(%esi)
popfl
ret
not_same:
movl (%esi), %eax
half_same:
movl 4(%esi), %edx
popfl
ret
CFI_ENDPROC
ENDPROC(cmpxchg8b_emu)
+112
View File
@@ -0,0 +1,112 @@
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
ALIGN
copy_page_c:
CFI_STARTPROC
movl $4096/8,%ecx
rep movsq
ret
CFI_ENDPROC
ENDPROC(copy_page_c)
/* Don't use streaming store because it's better when the target
ends up in cache. */
/* Could vary the prefetch distance based on SMP/UP */
ENTRY(copy_page)
CFI_STARTPROC
subq $2*8,%rsp
CFI_ADJUST_CFA_OFFSET 2*8
movq %rbx,(%rsp)
CFI_REL_OFFSET rbx, 0
movq %r12,1*8(%rsp)
CFI_REL_OFFSET r12, 1*8
movl $(4096/64)-5,%ecx
.p2align 4
.Loop64:
dec %rcx
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12
prefetcht0 5*64(%rsi)
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)
leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi
jnz .Loop64
movl $5,%ecx
.p2align 4
.Loop2:
decl %ecx
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)
leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi
jnz .Loop2
movq (%rsp),%rbx
CFI_RESTORE rbx
movq 1*8(%rsp),%r12
CFI_RESTORE r12
addq $2*8,%rsp
CFI_ADJUST_CFA_OFFSET -2*8
ret
.Lcopy_page_end:
CFI_ENDPROC
ENDPROC(copy_page)
/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
2:
.previous
.section .altinstructions,"a"
altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
.Lcopy_page_end-copy_page, 2b-1b
.previous
+302
View File
@@ -0,0 +1,302 @@
/*
* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
* Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License v2.
*
* Functions to copy from and to user space.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#define FIX_ALIGNMENT 1
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/*
* By placing feature2 after feature1 in altinstructions section, we logically
* implement:
* If CPU has feature2, jmp to alt2 is used
* else if CPU has feature1, jmp to alt1 is used
* else jmp to orig is used.
*/
.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */
.long \alt1-1b /* offset */ /* or alternatively to alt1 */
3: .byte 0xe9 /* near jump with 32bit immediate */
.long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
.section .altinstructions,"a"
altinstruction_entry 0b,2b,\feature1,5,5
altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
.macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
jz 102f /* already aligned */
subl $8,%ecx
negl %ecx
subl %ecx,%edx
100: movb (%rsi),%al
101: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 100b
102:
.section .fixup,"ax"
103: addl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
.section __ex_table,"a"
.align 8
.quad 100b,103b
.quad 101b,103b
.previous
#endif
.endm
/* Standard copy_to_user with segment limit checking */
ENTRY(_copy_to_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rdi,%rcx
addq %rdx,%rcx
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
copy_user_generic_unrolled,copy_user_generic_string, \
copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
ENTRY(_copy_from_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
addq %rdx,%rcx
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_from_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
copy_user_generic_unrolled,copy_user_generic_string, \
copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_from_user)
.section .fixup,"ax"
/* must zero dest */
ENTRY(bad_from_user)
bad_from_user:
CFI_STARTPROC
movl %edx,%ecx
xorl %eax,%eax
rep
stosb
bad_to_user:
movl %edx,%eax
ret
CFI_ENDPROC
ENDPROC(bad_from_user)
.previous
/*
* copy_user_generic_unrolled - memory copy with exception handling.
* This version is for CPUs like P4 that don't have efficient micro
* code for rep movsq
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_unrolled)
CFI_STARTPROC
cmpl $8,%edx
jb 20f /* less then 8 bytes, go to byte copy loop */
ALIGN_DESTINATION
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
jz 17f
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
4: movq 3*8(%rsi),%r11
5: movq %r8,(%rdi)
6: movq %r9,1*8(%rdi)
7: movq %r10,2*8(%rdi)
8: movq %r11,3*8(%rdi)
9: movq 4*8(%rsi),%r8
10: movq 5*8(%rsi),%r9
11: movq 6*8(%rsi),%r10
12: movq 7*8(%rsi),%r11
13: movq %r8,4*8(%rdi)
14: movq %r9,5*8(%rdi)
15: movq %r10,6*8(%rdi)
16: movq %r11,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
17: movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz 20f
18: movq (%rsi),%r8
19: movq %r8,(%rdi)
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
decl %ecx
jnz 18b
20: andl %edx,%edx
jz 23f
movl %edx,%ecx
21: movb (%rsi),%al
22: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 21b
23: xor %eax,%eax
ret
.section .fixup,"ax"
30: shll $6,%ecx
addl %ecx,%edx
jmp 60f
40: lea (%rdx,%rcx,8),%rdx
jmp 60f
50: movl %ecx,%edx
60: jmp copy_user_handle_tail /* ecx is zerorest also */
.previous
.section __ex_table,"a"
.align 8
.quad 1b,30b
.quad 2b,30b
.quad 3b,30b
.quad 4b,30b
.quad 5b,30b
.quad 6b,30b
.quad 7b,30b
.quad 8b,30b
.quad 9b,30b
.quad 10b,30b
.quad 11b,30b
.quad 12b,30b
.quad 13b,30b
.quad 14b,30b
.quad 15b,30b
.quad 16b,30b
.quad 18b,40b
.quad 19b,40b
.quad 21b,50b
.quad 22b,50b
.previous
CFI_ENDPROC
ENDPROC(copy_user_generic_unrolled)
/* Some CPUs run faster using the string copy instructions.
* This is also a lot simpler. Use them when possible.
*
* Only 4GB of copy is supported. This shouldn't be a problem
* because the kernel normally only writes from/to page sized chunks
* even if user space passed a longer buffer.
* And more would be dangerous because both Intel and AMD have
* errata with rep movsq > 4GB. If someone feels the need to fix
* this please consider this.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_string)
CFI_STARTPROC
andl %edx,%edx
jz 4f
cmpl $8,%edx
jb 2f /* less than 8 bytes, go to byte copy loop */
ALIGN_DESTINATION
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
1: rep
movsq
2: movl %edx,%ecx
3: rep
movsb
4: xorl %eax,%eax
ret
.section .fixup,"ax"
11: lea (%rdx,%rcx,8),%rcx
12: movl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
.section __ex_table,"a"
.align 8
.quad 1b,11b
.quad 3b,12b
.previous
CFI_ENDPROC
ENDPROC(copy_user_generic_string)
/*
* Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
* It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_enhanced_fast_string)
CFI_STARTPROC
andl %edx,%edx
jz 2f
movl %edx,%ecx
1: rep
movsb
2: xorl %eax,%eax
ret
.section .fixup,"ax"
12: movl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
.section __ex_table,"a"
.align 8
.quad 1b,12b
.previous
CFI_ENDPROC
ENDPROC(copy_user_enhanced_fast_string)
+137
View File
@@ -0,0 +1,137 @@
/*
* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
* Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License v2.
*
* Functions to copy from and to user space.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#define FIX_ALIGNMENT 1
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
.macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
jz 102f /* already aligned */
subl $8,%ecx
negl %ecx
subl %ecx,%edx
100: movb (%rsi),%al
101: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 100b
102:
.section .fixup,"ax"
103: addl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
.section __ex_table,"a"
.align 8
.quad 100b,103b
.quad 101b,103b
.previous
#endif
.endm
/*
* copy_user_nocache - Uncached memory copy with exception handling
* This will force destination/source out of cache for more performance.
*/
ENTRY(__copy_user_nocache)
CFI_STARTPROC
cmpl $8,%edx
jb 20f /* less then 8 bytes, go to byte copy loop */
ALIGN_DESTINATION
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
jz 17f
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
4: movq 3*8(%rsi),%r11
5: movnti %r8,(%rdi)
6: movnti %r9,1*8(%rdi)
7: movnti %r10,2*8(%rdi)
8: movnti %r11,3*8(%rdi)
9: movq 4*8(%rsi),%r8
10: movq 5*8(%rsi),%r9
11: movq 6*8(%rsi),%r10
12: movq 7*8(%rsi),%r11
13: movnti %r8,4*8(%rdi)
14: movnti %r9,5*8(%rdi)
15: movnti %r10,6*8(%rdi)
16: movnti %r11,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
17: movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz 20f
18: movq (%rsi),%r8
19: movnti %r8,(%rdi)
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
decl %ecx
jnz 18b
20: andl %edx,%edx
jz 23f
movl %edx,%ecx
21: movb (%rsi),%al
22: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 21b
23: xorl %eax,%eax
sfence
ret
.section .fixup,"ax"
30: shll $6,%ecx
addl %ecx,%edx
jmp 60f
40: lea (%rdx,%rcx,8),%rdx
jmp 60f
50: movl %ecx,%edx
60: sfence
jmp copy_user_handle_tail
.previous
.section __ex_table,"a"
.quad 1b,30b
.quad 2b,30b
.quad 3b,30b
.quad 4b,30b
.quad 5b,30b
.quad 6b,30b
.quad 7b,30b
.quad 8b,30b
.quad 9b,30b
.quad 10b,30b
.quad 11b,30b
.quad 12b,30b
.quad 13b,30b
.quad 14b,30b
.quad 15b,30b
.quad 16b,30b
.quad 18b,40b
.quad 19b,40b
.quad 21b,50b
.quad 22b,50b
.previous
CFI_ENDPROC
ENDPROC(__copy_user_nocache)
+249
View File
@@ -0,0 +1,249 @@
/*
* Copyright 2002, 2003 Andi Kleen, SuSE Labs.
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of this archive
* for more details. No warranty for anything given at all.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/errno.h>
/*
* Checksum copy with exception handling.
* On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
* destination is zeroed.
*
* Input
* rdi source
* rsi destination
* edx len (32bit)
* ecx sum (32bit)
* r8 src_err_ptr (int)
* r9 dst_err_ptr (int)
*
* Output
* eax 64bit sum. undefined in case of exception.
*
* Wrappers need to take care of valid exception sum and zeroing.
* They also should align source or destination to 8 bytes.
*/
.macro source
10:
.section __ex_table, "a"
.align 8
.quad 10b, .Lbad_source
.previous
.endm
.macro dest
20:
.section __ex_table, "a"
.align 8
.quad 20b, .Lbad_dest
.previous
.endm
.macro ignore L=.Lignore
30:
.section __ex_table, "a"
.align 8
.quad 30b, \L
.previous
.endm
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
cmpl $3*64, %edx
jle .Lignore
.Lignore:
subq $7*8, %rsp
CFI_ADJUST_CFA_OFFSET 7*8
movq %rbx, 2*8(%rsp)
CFI_REL_OFFSET rbx, 2*8
movq %r12, 3*8(%rsp)
CFI_REL_OFFSET r12, 3*8
movq %r14, 4*8(%rsp)
CFI_REL_OFFSET r14, 4*8
movq %r13, 5*8(%rsp)
CFI_REL_OFFSET r13, 5*8
movq %rbp, 6*8(%rsp)
CFI_REL_OFFSET rbp, 6*8
movq %r8, (%rsp)
movq %r9, 1*8(%rsp)
movl %ecx, %eax
movl %edx, %ecx
xorl %r9d, %r9d
movq %rcx, %r12
shrq $6, %r12
jz .Lhandle_tail /* < 64 */
clc
/* main loop. clear in 64 byte blocks */
/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
/* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
.p2align 4
.Lloop:
source
movq (%rdi), %rbx
source
movq 8(%rdi), %r8
source
movq 16(%rdi), %r11
source
movq 24(%rdi), %rdx
source
movq 32(%rdi), %r10
source
movq 40(%rdi), %rbp
source
movq 48(%rdi), %r14
source
movq 56(%rdi), %r13
ignore 2f
prefetcht0 5*64(%rdi)
2:
adcq %rbx, %rax
adcq %r8, %rax
adcq %r11, %rax
adcq %rdx, %rax
adcq %r10, %rax
adcq %rbp, %rax
adcq %r14, %rax
adcq %r13, %rax
decl %r12d
dest
movq %rbx, (%rsi)
dest
movq %r8, 8(%rsi)
dest
movq %r11, 16(%rsi)
dest
movq %rdx, 24(%rsi)
dest
movq %r10, 32(%rsi)
dest
movq %rbp, 40(%rsi)
dest
movq %r14, 48(%rsi)
dest
movq %r13, 56(%rsi)
3:
leaq 64(%rdi), %rdi
leaq 64(%rsi), %rsi
jnz .Lloop
adcq %r9, %rax
/* do last up to 56 bytes */
.Lhandle_tail:
/* ecx: count */
movl %ecx, %r10d
andl $63, %ecx
shrl $3, %ecx
jz .Lfold
clc
.p2align 4
.Lloop_8:
source
movq (%rdi), %rbx
adcq %rbx, %rax
decl %ecx
dest
movq %rbx, (%rsi)
leaq 8(%rsi), %rsi /* preserve carry */
leaq 8(%rdi), %rdi
jnz .Lloop_8
adcq %r9, %rax /* add in carry */
.Lfold:
/* reduce checksum to 32bits */
movl %eax, %ebx
shrq $32, %rax
addl %ebx, %eax
adcl %r9d, %eax
/* do last up to 6 bytes */
.Lhandle_7:
movl %r10d, %ecx
andl $7, %ecx
shrl $1, %ecx
jz .Lhandle_1
movl $2, %edx
xorl %ebx, %ebx
clc
.p2align 4
.Lloop_1:
source
movw (%rdi), %bx
adcl %ebx, %eax
decl %ecx
dest
movw %bx, (%rsi)
leaq 2(%rdi), %rdi
leaq 2(%rsi), %rsi
jnz .Lloop_1
adcl %r9d, %eax /* add in carry */
/* handle last odd byte */
.Lhandle_1:
testl $1, %r10d
jz .Lende
xorl %ebx, %ebx
source
movb (%rdi), %bl
dest
movb %bl, (%rsi)
addl %ebx, %eax
adcl %r9d, %eax /* carry */
CFI_REMEMBER_STATE
.Lende:
movq 2*8(%rsp), %rbx
CFI_RESTORE rbx
movq 3*8(%rsp), %r12
CFI_RESTORE r12
movq 4*8(%rsp), %r14
CFI_RESTORE r14
movq 5*8(%rsp), %r13
CFI_RESTORE r13
movq 6*8(%rsp), %rbp
CFI_RESTORE rbp
addq $7*8, %rsp
CFI_ADJUST_CFA_OFFSET -7*8
ret
CFI_RESTORE_STATE
/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
movq (%rsp), %rax
testq %rax, %rax
jz .Lende
movl $-EFAULT, (%rax)
jmp .Lende
.Lbad_dest:
movq 8(%rsp), %rax
testq %rax, %rax
jz .Lende
movl $-EFAULT, (%rax)
jmp .Lende
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
+148
View File
@@ -0,0 +1,148 @@
/*
* arch/x86_64/lib/csum-partial.c
*
* This file contains network checksum routines that are better done
* in an architecture-specific manner due to speed.
*/
#include <linux/compiler.h>
#include <linux/module.h>
#include <asm/checksum.h>
static inline unsigned short from32to16(unsigned a)
{
unsigned short b = a >> 16;
asm("addw %w2,%w0\n\t"
"adcw $0,%w0\n"
: "=r" (b)
: "0" (b), "r" (a));
return b;
}
/*
* Do a 64-bit checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
*
* Things tried and found to not make it faster:
* Manual Prefetching
* Unrolling to an 128 bytes inner loop.
* Using interleaving with more registers to break the carry chains.
*/
static unsigned do_csum(const unsigned char *buff, unsigned len)
{
unsigned odd, count;
unsigned long result = 0;
if (unlikely(len == 0))
return result;
odd = 1 & (unsigned long) buff;
if (unlikely(odd)) {
result = *buff << 8;
len--;
buff++;
}
count = len >> 1; /* nr of 16-bit words.. */
if (count) {
if (2 & (unsigned long) buff) {
result += *(unsigned short *)buff;
count--;
len -= 2;
buff += 2;
}
count >>= 1; /* nr of 32-bit words.. */
if (count) {
unsigned long zero;
unsigned count64;
if (4 & (unsigned long) buff) {
result += *(unsigned int *) buff;
count--;
len -= 4;
buff += 4;
}
count >>= 1; /* nr of 64-bit words.. */
/* main loop using 64byte blocks */
zero = 0;
count64 = count >> 3;
while (count64) {
asm("addq 0*8(%[src]),%[res]\n\t"
"adcq 1*8(%[src]),%[res]\n\t"
"adcq 2*8(%[src]),%[res]\n\t"
"adcq 3*8(%[src]),%[res]\n\t"
"adcq 4*8(%[src]),%[res]\n\t"
"adcq 5*8(%[src]),%[res]\n\t"
"adcq 6*8(%[src]),%[res]\n\t"
"adcq 7*8(%[src]),%[res]\n\t"
"adcq %[zero],%[res]"
: [res] "=r" (result)
: [src] "r" (buff), [zero] "r" (zero),
"[res]" (result));
buff += 64;
count64--;
}
/* last up to 7 8byte blocks */
count %= 8;
while (count) {
asm("addq %1,%0\n\t"
"adcq %2,%0\n"
: "=r" (result)
: "m" (*(unsigned long *)buff),
"r" (zero), "0" (result));
--count;
buff += 8;
}
result = add32_with_carry(result>>32,
result&0xffffffff);
if (len & 4) {
result += *(unsigned int *) buff;
buff += 4;
}
}
if (len & 2) {
result += *(unsigned short *) buff;
buff += 2;
}
}
if (len & 1)
result += *buff;
result = add32_with_carry(result>>32, result & 0xffffffff);
if (unlikely(odd)) {
result = from32to16(result);
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
}
return result;
}
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 64-bit boundary
*/
__wsum csum_partial(const void *buff, int len, __wsum sum)
{
return (__force __wsum)add32_with_carry(do_csum(buff, len),
(__force u32)sum);
}
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
__sum16 ip_compute_csum(const void *buff, int len)
{
return csum_fold(csum_partial(buff,len,0));
}
EXPORT_SYMBOL(ip_compute_csum);
+150
View File
@@ -0,0 +1,150 @@
/*
* Copyright 2002, 2003 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License v.2
*
* Wrappers of assembly checksum functions for x86-64.
*/
#include <asm/checksum.h>
#include <linux/module.h>
/**
* csum_partial_copy_from_user - Copy and checksum from user space.
* @src: source address (user space)
* @dst: destination address
* @len: number of bytes to be copied.
* @isum: initial sum that is added into the result (32bit unfolded)
* @errp: set to -EFAULT for an bad source address.
*
* Returns an 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
*/
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst,
int len, __wsum isum, int *errp)
{
might_sleep();
*errp = 0;
if (!likely(access_ok(VERIFY_READ, src, len)))
goto out_err;
/*
* Why 6, not 7? To handle odd addresses aligned we
* would need to do considerable complications to fix the
* checksum which is defined as an 16bit accumulator. The
* fix alignment code is primarily for performance
* compatibility with 32bit and that will handle odd
* addresses slowly too.
*/
if (unlikely((unsigned long)src & 6)) {
while (((unsigned long)src & 6) && len >= 2) {
__u16 val16;
*errp = __get_user(val16, (const __u16 __user *)src);
if (*errp)
return isum;
*(__u16 *)dst = val16;
isum = (__force __wsum)add32_with_carry(
(__force unsigned)isum, val16);
src += 2;
dst += 2;
len -= 2;
}
}
isum = csum_partial_copy_generic((__force const void *)src,
dst, len, isum, errp, NULL);
if (unlikely(*errp))
goto out_err;
return isum;
out_err:
*errp = -EFAULT;
memset(dst, 0, len);
return isum;
}
EXPORT_SYMBOL(csum_partial_copy_from_user);
/**
* csum_partial_copy_to_user - Copy and checksum to user space.
* @src: source address
* @dst: destination address (user space)
* @len: number of bytes to be copied.
* @isum: initial sum that is added into the result (32bit unfolded)
* @errp: set to -EFAULT for an bad destination address.
*
* Returns an 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
*/
__wsum
csum_partial_copy_to_user(const void *src, void __user *dst,
int len, __wsum isum, int *errp)
{
might_sleep();
if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
*errp = -EFAULT;
return 0;
}
if (unlikely((unsigned long)dst & 6)) {
while (((unsigned long)dst & 6) && len >= 2) {
__u16 val16 = *(__u16 *)src;
isum = (__force __wsum)add32_with_carry(
(__force unsigned)isum, val16);
*errp = __put_user(val16, (__u16 __user *)dst);
if (*errp)
return isum;
src += 2;
dst += 2;
len -= 2;
}
}
*errp = 0;
return csum_partial_copy_generic(src, (void __force *)dst,
len, isum, NULL, errp);
}
EXPORT_SYMBOL(csum_partial_copy_to_user);
/**
* csum_partial_copy_nocheck - Copy and checksum.
* @src: source address
* @dst: destination address
* @len: number of bytes to be copied.
* @isum: initial sum that is added into the result (32bit unfolded)
*
* Returns an 32bit unfolded checksum of the buffer.
*/
__wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
{
return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
}
EXPORT_SYMBOL(csum_partial_copy_nocheck);
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
__u32 len, unsigned short proto, __wsum sum)
{
__u64 rest, sum64;
rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
(__force __u64)sum;
asm(" addq (%[saddr]),%[sum]\n"
" adcq 8(%[saddr]),%[sum]\n"
" adcq (%[daddr]),%[sum]\n"
" adcq 8(%[daddr]),%[sum]\n"
" adcq $0,%[sum]\n"
: [sum] "=r" (sum64)
: "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr));
return csum_fold(
(__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
}
EXPORT_SYMBOL(csum_ipv6_magic);
+140
View File
@@ -0,0 +1,140 @@
/*
* Precise Delay Loops for i386
*
* Copyright (C) 1993 Linus Torvalds
* Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
* Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
*
* The __delay function must _NOT_ be inlined as its execution time
* depends wildly on alignment on many x86 processors. The additional
* jump magic is needed to get the timing stable on all the CPU's
* we have to worry about.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/timex.h>
#include <linux/preempt.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <asm/processor.h>
#include <asm/delay.h>
#include <asm/timer.h>
#ifdef CONFIG_SMP
# include <asm/smp.h>
#endif
/* simple loop based delay: */
static void delay_loop(unsigned long loops)
{
asm volatile(
" test %0,%0 \n"
" jz 3f \n"
" jmp 1f \n"
".align 16 \n"
"1: jmp 2f \n"
".align 16 \n"
"2: dec %0 \n"
" jnz 2b \n"
"3: dec %0 \n"
: /* we don't need output */
:"a" (loops)
);
}
/* TSC based delay: */
static void delay_tsc(unsigned long __loops)
{
u32 bclock, now, loops = __loops;
int cpu;
preempt_disable();
cpu = smp_processor_id();
rdtsc_barrier();
rdtscl(bclock);
for (;;) {
rdtsc_barrier();
rdtscl(now);
if ((now - bclock) >= loops)
break;
/* Allow RT tasks to run */
preempt_enable();
rep_nop();
preempt_disable();
/*
* It is possible that we moved to another CPU, and
* since TSC's are per-cpu we need to calculate
* that. The delay must guarantee that we wait "at
* least" the amount of time. Being moved to another
* CPU could make the wait longer but we just need to
* make sure we waited long enough. Rebalance the
* counter for this CPU.
*/
if (unlikely(cpu != smp_processor_id())) {
loops -= (now - bclock);
cpu = smp_processor_id();
rdtsc_barrier();
rdtscl(bclock);
}
}
preempt_enable();
}
/*
* Since we calibrate only once at boot, this
* function should be set once at boot and not changed
*/
static void (*delay_fn)(unsigned long) = delay_loop;
void use_tsc_delay(void)
{
delay_fn = delay_tsc;
}
int __devinit read_current_timer(unsigned long *timer_val)
{
if (delay_fn == delay_tsc) {
rdtscll(*timer_val);
return 0;
}
return -1;
}
void __delay(unsigned long loops)
{
delay_fn(loops);
}
EXPORT_SYMBOL(__delay);
inline void __const_udelay(unsigned long xloops)
{
int d0;
xloops *= 4;
asm("mull %%edx"
:"=d" (xloops), "=&a" (d0)
:"1" (xloops), "0"
(this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
__delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);
void __udelay(unsigned long usecs)
{
__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
+104
View File
@@ -0,0 +1,104 @@
/*
* __get_user functions.
*
* (C) Copyright 1998 Linus Torvalds
* (C) Copyright 2005 Andi Kleen
* (C) Copyright 2008 Glauber Costa
*
* These functions have a non-standard call interface
* to make them more efficient, especially as they
* return an error value in addition to the "real"
* return value.
*/
/*
* __get_user_X
*
* Inputs: %[r|e]ax contains the address.
* The register is modified, but all changes are undone
* before returning because the C code doesn't know about it.
*
* Outputs: %[r|e]ax is error code (0 or -EFAULT)
* %[r|e]dx contains zero-extended value
*
*
* These functions should not modify any other registers,
* as they get called from within inline assembly.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/page_types.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/asm.h>
.text
ENTRY(__get_user_1)
CFI_STARTPROC
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
1: movzb (%_ASM_AX),%edx
xor %eax,%eax
ret
CFI_ENDPROC
ENDPROC(__get_user_1)
ENTRY(__get_user_2)
CFI_STARTPROC
add $1,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
2: movzwl -1(%_ASM_AX),%edx
xor %eax,%eax
ret
CFI_ENDPROC
ENDPROC(__get_user_2)
ENTRY(__get_user_4)
CFI_STARTPROC
add $3,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
3: mov -3(%_ASM_AX),%edx
xor %eax,%eax
ret
CFI_ENDPROC
ENDPROC(__get_user_4)
#ifdef CONFIG_X86_64
ENTRY(__get_user_8)
CFI_STARTPROC
add $7,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
4: movq -7(%_ASM_AX),%_ASM_DX
xor %eax,%eax
ret
CFI_ENDPROC
ENDPROC(__get_user_8)
#endif
bad_get_user:
CFI_STARTPROC
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
ret
CFI_ENDPROC
END(bad_get_user)
.section __ex_table,"a"
_ASM_PTR 1b,bad_get_user
_ASM_PTR 2b,bad_get_user
_ASM_PTR 3b,bad_get_user
#ifdef CONFIG_X86_64
_ASM_PTR 4b,bad_get_user
#endif
+97
View File
@@ -0,0 +1,97 @@
/*
* x86 instruction attribute tables
*
* Written by Masami Hiramatsu <mhiramat@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <asm/insn.h>
/* Attribute tables are generated from opcode map */
#include "inat-tables.c"
/* Attribute search APIs */
insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
{
return inat_primary_table[opcode];
}
int inat_get_last_prefix_id(insn_byte_t last_pfx)
{
insn_attr_t lpfx_attr;
lpfx_attr = inat_get_opcode_attribute(last_pfx);
return inat_last_prefix_id(lpfx_attr);
}
insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
insn_attr_t esc_attr)
{
const insn_attr_t *table;
int n;
n = inat_escape_id(esc_attr);
table = inat_escape_tables[n][0];
if (!table)
return 0;
if (inat_has_variant(table[opcode]) && lpfx_id) {
table = inat_escape_tables[n][lpfx_id];
if (!table)
return 0;
}
return table[opcode];
}
insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
insn_attr_t grp_attr)
{
const insn_attr_t *table;
int n;
n = inat_group_id(grp_attr);
table = inat_group_tables[n][0];
if (!table)
return inat_group_common_attribute(grp_attr);
if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
table = inat_group_tables[n][lpfx_id];
if (!table)
return inat_group_common_attribute(grp_attr);
}
return table[X86_MODRM_REG(modrm)] |
inat_group_common_attribute(grp_attr);
}
insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
insn_byte_t vex_p)
{
const insn_attr_t *table;
if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
return 0;
/* At first, this checks the master table */
table = inat_avx_tables[vex_m][0];
if (!table)
return 0;
if (!inat_is_group(table[opcode]) && vex_p) {
/* If this is not a group, get attribute directly */
table = inat_avx_tables[vex_m][vex_p];
if (!table)
return 0;
}
return table[opcode];
}
+576
View File
@@ -0,0 +1,576 @@
/*
* x86 instruction analysis
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2002, 2004, 2009
*/
#include <linux/string.h>
#include <asm/inat.h>
#include <asm/insn.h>
/* Verify next sizeof(t) bytes can be on the same instruction */
#define validate_next(t, insn, n) \
((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE)
#define __get_next(t, insn) \
({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
#define __peek_nbyte_next(t, insn, n) \
({ t r = *(t*)((insn)->next_byte + n); r; })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
#define peek_nbyte_next(t, insn, n) \
({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
#define peek_next(t, insn) peek_nbyte_next(t, insn, 0)
/**
* insn_init() - initialize struct insn
* @insn: &struct insn to be initialized
* @kaddr: address (in kernel memory) of instruction (or copy thereof)
* @x86_64: !0 for 64-bit kernel or 64-bit app
*/
void insn_init(struct insn *insn, const void *kaddr, int x86_64)
{
memset(insn, 0, sizeof(*insn));
insn->kaddr = kaddr;
insn->next_byte = kaddr;
insn->x86_64 = x86_64 ? 1 : 0;
insn->opnd_bytes = 4;
if (x86_64)
insn->addr_bytes = 8;
else
insn->addr_bytes = 4;
}
/**
* insn_get_prefixes - scan x86 instruction prefix bytes
* @insn: &struct insn containing instruction
*
* Populates the @insn->prefixes bitmap, and updates @insn->next_byte
* to point to the (first) opcode. No effect if @insn->prefixes.got
* is already set.
*/
void insn_get_prefixes(struct insn *insn)
{
struct insn_field *prefixes = &insn->prefixes;
insn_attr_t attr;
insn_byte_t b, lb;
int i, nb;
if (prefixes->got)
return;
nb = 0;
lb = 0;
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
while (inat_is_legacy_prefix(attr)) {
/* Skip if same prefix */
for (i = 0; i < nb; i++)
if (prefixes->bytes[i] == b)
goto found;
if (nb == 4)
/* Invalid instruction */
break;
prefixes->bytes[nb++] = b;
if (inat_is_address_size_prefix(attr)) {
/* address size switches 2/4 or 4/8 */
if (insn->x86_64)
insn->addr_bytes ^= 12;
else
insn->addr_bytes ^= 6;
} else if (inat_is_operand_size_prefix(attr)) {
/* oprand size switches 2/4 */
insn->opnd_bytes ^= 6;
}
found:
prefixes->nbytes++;
insn->next_byte++;
lb = b;
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
}
/* Set the last prefix */
if (lb && lb != insn->prefixes.bytes[3]) {
if (unlikely(insn->prefixes.bytes[3])) {
/* Swap the last prefix */
b = insn->prefixes.bytes[3];
for (i = 0; i < nb; i++)
if (prefixes->bytes[i] == lb)
prefixes->bytes[i] = b;
}
insn->prefixes.bytes[3] = lb;
}
/* Decode REX prefix */
if (insn->x86_64) {
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
if (inat_is_rex_prefix(attr)) {
insn->rex_prefix.value = b;
insn->rex_prefix.nbytes = 1;
insn->next_byte++;
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
insn->opnd_bytes = 8;
}
}
insn->rex_prefix.got = 1;
/* Decode VEX prefix */
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
if (inat_is_vex_prefix(attr)) {
insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
if (!insn->x86_64) {
/*
* In 32-bits mode, if the [7:6] bits (mod bits of
* ModRM) on the second byte are not 11b, it is
* LDS or LES.
*/
if (X86_MODRM_MOD(b2) != 3)
goto vex_end;
}
insn->vex_prefix.bytes[0] = b;
insn->vex_prefix.bytes[1] = b2;
if (inat_is_vex3_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
insn->vex_prefix.bytes[2] = b2;
insn->vex_prefix.nbytes = 3;
insn->next_byte += 3;
if (insn->x86_64 && X86_VEX_W(b2))
/* VEX.W overrides opnd_size */
insn->opnd_bytes = 8;
} else {
insn->vex_prefix.nbytes = 2;
insn->next_byte += 2;
}
}
vex_end:
insn->vex_prefix.got = 1;
prefixes->got = 1;
err_out:
return;
}
/**
* insn_get_opcode - collect opcode(s)
* @insn: &struct insn containing instruction
*
* Populates @insn->opcode, updates @insn->next_byte to point past the
* opcode byte(s), and set @insn->attr (except for groups).
* If necessary, first collects any preceding (prefix) bytes.
* Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
* is already 1.
*/
void insn_get_opcode(struct insn *insn)
{
struct insn_field *opcode = &insn->opcode;
insn_byte_t op;
int pfx_id;
if (opcode->got)
return;
if (!insn->prefixes.got)
insn_get_prefixes(insn);
/* Get first opcode */
op = get_next(insn_byte_t, insn);
opcode->bytes[0] = op;
opcode->nbytes = 1;
/* Check if there is VEX prefix or not */
if (insn_is_avx(insn)) {
insn_byte_t m, p;
m = insn_vex_m_bits(insn);
p = insn_vex_p_bits(insn);
insn->attr = inat_get_avx_attribute(op, m, p);
if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
insn->attr = 0; /* This instruction is bad */
goto end; /* VEX has only 1 byte for opcode */
}
insn->attr = inat_get_opcode_attribute(op);
while (inat_is_escape(insn->attr)) {
/* Get escaped opcode */
op = get_next(insn_byte_t, insn);
opcode->bytes[opcode->nbytes++] = op;
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
}
if (inat_must_vex(insn->attr))
insn->attr = 0; /* This instruction is bad */
end:
opcode->got = 1;
err_out:
return;
}
/**
* insn_get_modrm - collect ModRM byte, if any
* @insn: &struct insn containing instruction
*
* Populates @insn->modrm and updates @insn->next_byte to point past the
* ModRM byte, if any. If necessary, first collects the preceding bytes
* (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
*/
void insn_get_modrm(struct insn *insn)
{
struct insn_field *modrm = &insn->modrm;
insn_byte_t pfx_id, mod;
if (modrm->got)
return;
if (!insn->opcode.got)
insn_get_opcode(insn);
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
modrm->value = mod;
modrm->nbytes = 1;
if (inat_is_group(insn->attr)) {
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
insn->attr);
if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
insn->attr = 0; /* This is bad */
}
}
if (insn->x86_64 && inat_is_force64(insn->attr))
insn->opnd_bytes = 8;
modrm->got = 1;
err_out:
return;
}
/**
* insn_rip_relative() - Does instruction use RIP-relative addressing mode?
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* ModRM byte. No effect if @insn->x86_64 is 0.
*/
int insn_rip_relative(struct insn *insn)
{
struct insn_field *modrm = &insn->modrm;
if (!insn->x86_64)
return 0;
if (!modrm->got)
insn_get_modrm(insn);
/*
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
*/
return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
}
/**
* insn_get_sib() - Get the SIB byte of instruction
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* ModRM byte.
*/
void insn_get_sib(struct insn *insn)
{
insn_byte_t modrm;
if (insn->sib.got)
return;
if (!insn->modrm.got)
insn_get_modrm(insn);
if (insn->modrm.nbytes) {
modrm = (insn_byte_t)insn->modrm.value;
if (insn->addr_bytes != 2 &&
X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
insn->sib.value = get_next(insn_byte_t, insn);
insn->sib.nbytes = 1;
}
}
insn->sib.got = 1;
err_out:
return;
}
/**
* insn_get_displacement() - Get the displacement of instruction
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* SIB byte.
* Displacement value is sign-expanded.
*/
void insn_get_displacement(struct insn *insn)
{
insn_byte_t mod, rm, base;
if (insn->displacement.got)
return;
if (!insn->sib.got)
insn_get_sib(insn);
if (insn->modrm.nbytes) {
/*
* Interpreting the modrm byte:
* mod = 00 - no displacement fields (exceptions below)
* mod = 01 - 1-byte displacement field
* mod = 10 - displacement field is 4 bytes, or 2 bytes if
* address size = 2 (0x67 prefix in 32-bit mode)
* mod = 11 - no memory operand
*
* If address size = 2...
* mod = 00, r/m = 110 - displacement field is 2 bytes
*
* If address size != 2...
* mod != 11, r/m = 100 - SIB byte exists
* mod = 00, SIB base = 101 - displacement field is 4 bytes
* mod = 00, r/m = 101 - rip-relative addressing, displacement
* field is 4 bytes
*/
mod = X86_MODRM_MOD(insn->modrm.value);
rm = X86_MODRM_RM(insn->modrm.value);
base = X86_SIB_BASE(insn->sib.value);
if (mod == 3)
goto out;
if (mod == 1) {
insn->displacement.value = get_next(char, insn);
insn->displacement.nbytes = 1;
} else if (insn->addr_bytes == 2) {
if ((mod == 0 && rm == 6) || mod == 2) {
insn->displacement.value =
get_next(short, insn);
insn->displacement.nbytes = 2;
}
} else {
if ((mod == 0 && rm == 5) || mod == 2 ||
(mod == 0 && base == 5)) {
insn->displacement.value = get_next(int, insn);
insn->displacement.nbytes = 4;
}
}
}
out:
insn->displacement.got = 1;
err_out:
return;
}
/* Decode moffset16/32/64. Return 0 if failed */
static int __get_moffset(struct insn *insn)
{
switch (insn->addr_bytes) {
case 2:
insn->moffset1.value = get_next(short, insn);
insn->moffset1.nbytes = 2;
break;
case 4:
insn->moffset1.value = get_next(int, insn);
insn->moffset1.nbytes = 4;
break;
case 8:
insn->moffset1.value = get_next(int, insn);
insn->moffset1.nbytes = 4;
insn->moffset2.value = get_next(int, insn);
insn->moffset2.nbytes = 4;
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
}
insn->moffset1.got = insn->moffset2.got = 1;
return 1;
err_out:
return 0;
}
/* Decode imm v32(Iz). Return 0 if failed */
static int __get_immv32(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
insn->immediate.value = get_next(short, insn);
insn->immediate.nbytes = 2;
break;
case 4:
case 8:
insn->immediate.value = get_next(int, insn);
insn->immediate.nbytes = 4;
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
}
return 1;
err_out:
return 0;
}
/* Decode imm v64(Iv/Ov), Return 0 if failed */
static int __get_immv(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
insn->immediate1.value = get_next(short, insn);
insn->immediate1.nbytes = 2;
break;
case 4:
insn->immediate1.value = get_next(int, insn);
insn->immediate1.nbytes = 4;
break;
case 8:
insn->immediate1.value = get_next(int, insn);
insn->immediate1.nbytes = 4;
insn->immediate2.value = get_next(int, insn);
insn->immediate2.nbytes = 4;
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
}
insn->immediate1.got = insn->immediate2.got = 1;
return 1;
err_out:
return 0;
}
/* Decode ptr16:16/32(Ap) */
static int __get_immptr(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
insn->immediate1.value = get_next(short, insn);
insn->immediate1.nbytes = 2;
break;
case 4:
insn->immediate1.value = get_next(int, insn);
insn->immediate1.nbytes = 4;
break;
case 8:
/* ptr16:64 is not exist (no segment) */
return 0;
default: /* opnd_bytes must be modified manually */
goto err_out;
}
insn->immediate2.value = get_next(unsigned short, insn);
insn->immediate2.nbytes = 2;
insn->immediate1.got = insn->immediate2.got = 1;
return 1;
err_out:
return 0;
}
/**
* insn_get_immediate() - Get the immediates of instruction
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* displacement bytes.
* Basically, most of immediates are sign-expanded. Unsigned-value can be
* get by bit masking with ((1 << (nbytes * 8)) - 1)
*/
void insn_get_immediate(struct insn *insn)
{
if (insn->immediate.got)
return;
if (!insn->displacement.got)
insn_get_displacement(insn);
if (inat_has_moffset(insn->attr)) {
if (!__get_moffset(insn))
goto err_out;
goto done;
}
if (!inat_has_immediate(insn->attr))
/* no immediates */
goto done;
switch (inat_immediate_size(insn->attr)) {
case INAT_IMM_BYTE:
insn->immediate.value = get_next(char, insn);
insn->immediate.nbytes = 1;
break;
case INAT_IMM_WORD:
insn->immediate.value = get_next(short, insn);
insn->immediate.nbytes = 2;
break;
case INAT_IMM_DWORD:
insn->immediate.value = get_next(int, insn);
insn->immediate.nbytes = 4;
break;
case INAT_IMM_QWORD:
insn->immediate1.value = get_next(int, insn);
insn->immediate1.nbytes = 4;
insn->immediate2.value = get_next(int, insn);
insn->immediate2.nbytes = 4;
break;
case INAT_IMM_PTR:
if (!__get_immptr(insn))
goto err_out;
break;
case INAT_IMM_VWORD32:
if (!__get_immv32(insn))
goto err_out;
break;
case INAT_IMM_VWORD:
if (!__get_immv(insn))
goto err_out;
break;
default:
/* Here, insn must have an immediate, but failed */
goto err_out;
}
if (inat_has_second_immediate(insn->attr)) {
insn->immediate2.value = get_next(char, insn);
insn->immediate2.nbytes = 1;
}
done:
insn->immediate.got = 1;
err_out:
return;
}
/**
* insn_get_length() - Get the length of instruction
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* immediates bytes.
*/
void insn_get_length(struct insn *insn)
{
if (insn->length)
return;
if (!insn->immediate.got)
insn_get_immediate(insn);
insn->length = (unsigned char)((unsigned long)insn->next_byte
- (unsigned long)insn->kaddr);
}
+30
View File
@@ -0,0 +1,30 @@
/*
* Copyright 2006 PathScale, Inc. All Rights Reserved.
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
/*
* override generic version in lib/iomap_copy.c
*/
ENTRY(__iowrite32_copy)
CFI_STARTPROC
movl %edx,%ecx
rep movsd
ret
CFI_ENDPROC
ENDPROC(__iowrite32_copy)
+208
View File
@@ -0,0 +1,208 @@
#include <linux/string.h>
#include <linux/module.h>
#undef memcpy
#undef memset
void *memcpy(void *to, const void *from, size_t n)
{
#ifdef CONFIG_X86_USE_3DNOW
return __memcpy3d(to, from, n);
#else
return __memcpy(to, from, n);
#endif
}
EXPORT_SYMBOL(memcpy);
void *memset(void *s, int c, size_t count)
{
return __memset(s, c, count);
}
EXPORT_SYMBOL(memset);
void *memmove(void *dest, const void *src, size_t n)
{
int d0,d1,d2,d3,d4,d5;
char *ret = dest;
__asm__ __volatile__(
/* Handle more 16bytes in loop */
"cmp $0x10, %0\n\t"
"jb 1f\n\t"
/* Decide forward/backward copy mode */
"cmp %2, %1\n\t"
"jb 2f\n\t"
/*
* movs instruction have many startup latency
* so we handle small size by general register.
*/
"cmp $680, %0\n\t"
"jb 3f\n\t"
/*
* movs instruction is only good for aligned case.
*/
"mov %1, %3\n\t"
"xor %2, %3\n\t"
"and $0xff, %3\n\t"
"jz 4f\n\t"
"3:\n\t"
"sub $0x10, %0\n\t"
/*
* We gobble 16byts forward in each loop.
*/
"3:\n\t"
"sub $0x10, %0\n\t"
"mov 0*4(%1), %3\n\t"
"mov 1*4(%1), %4\n\t"
"mov %3, 0*4(%2)\n\t"
"mov %4, 1*4(%2)\n\t"
"mov 2*4(%1), %3\n\t"
"mov 3*4(%1), %4\n\t"
"mov %3, 2*4(%2)\n\t"
"mov %4, 3*4(%2)\n\t"
"lea 0x10(%1), %1\n\t"
"lea 0x10(%2), %2\n\t"
"jae 3b\n\t"
"add $0x10, %0\n\t"
"jmp 1f\n\t"
/*
* Handle data forward by movs.
*/
".p2align 4\n\t"
"4:\n\t"
"mov -4(%1, %0), %3\n\t"
"lea -4(%2, %0), %4\n\t"
"shr $2, %0\n\t"
"rep movsl\n\t"
"mov %3, (%4)\n\t"
"jmp 11f\n\t"
/*
* Handle data backward by movs.
*/
".p2align 4\n\t"
"6:\n\t"
"mov (%1), %3\n\t"
"mov %2, %4\n\t"
"lea -4(%1, %0), %1\n\t"
"lea -4(%2, %0), %2\n\t"
"shr $2, %0\n\t"
"std\n\t"
"rep movsl\n\t"
"mov %3,(%4)\n\t"
"cld\n\t"
"jmp 11f\n\t"
/*
* Start to prepare for backward copy.
*/
".p2align 4\n\t"
"2:\n\t"
"cmp $680, %0\n\t"
"jb 5f\n\t"
"mov %1, %3\n\t"
"xor %2, %3\n\t"
"and $0xff, %3\n\t"
"jz 6b\n\t"
/*
* Calculate copy position to tail.
*/
"5:\n\t"
"add %0, %1\n\t"
"add %0, %2\n\t"
"sub $0x10, %0\n\t"
/*
* We gobble 16byts backward in each loop.
*/
"7:\n\t"
"sub $0x10, %0\n\t"
"mov -1*4(%1), %3\n\t"
"mov -2*4(%1), %4\n\t"
"mov %3, -1*4(%2)\n\t"
"mov %4, -2*4(%2)\n\t"
"mov -3*4(%1), %3\n\t"
"mov -4*4(%1), %4\n\t"
"mov %3, -3*4(%2)\n\t"
"mov %4, -4*4(%2)\n\t"
"lea -0x10(%1), %1\n\t"
"lea -0x10(%2), %2\n\t"
"jae 7b\n\t"
/*
* Calculate copy position to head.
*/
"add $0x10, %0\n\t"
"sub %0, %1\n\t"
"sub %0, %2\n\t"
/*
* Move data from 8 bytes to 15 bytes.
*/
".p2align 4\n\t"
"1:\n\t"
"cmp $8, %0\n\t"
"jb 8f\n\t"
"mov 0*4(%1), %3\n\t"
"mov 1*4(%1), %4\n\t"
"mov -2*4(%1, %0), %5\n\t"
"mov -1*4(%1, %0), %1\n\t"
"mov %3, 0*4(%2)\n\t"
"mov %4, 1*4(%2)\n\t"
"mov %5, -2*4(%2, %0)\n\t"
"mov %1, -1*4(%2, %0)\n\t"
"jmp 11f\n\t"
/*
* Move data from 4 bytes to 7 bytes.
*/
".p2align 4\n\t"
"8:\n\t"
"cmp $4, %0\n\t"
"jb 9f\n\t"
"mov 0*4(%1), %3\n\t"
"mov -1*4(%1, %0), %4\n\t"
"mov %3, 0*4(%2)\n\t"
"mov %4, -1*4(%2, %0)\n\t"
"jmp 11f\n\t"
/*
* Move data from 2 bytes to 3 bytes.
*/
".p2align 4\n\t"
"9:\n\t"
"cmp $2, %0\n\t"
"jb 10f\n\t"
"movw 0*2(%1), %%dx\n\t"
"movw -1*2(%1, %0), %%bx\n\t"
"movw %%dx, 0*2(%2)\n\t"
"movw %%bx, -1*2(%2, %0)\n\t"
"jmp 11f\n\t"
/*
* Move data for 1 byte.
*/
".p2align 4\n\t"
"10:\n\t"
"cmp $1, %0\n\t"
"jb 11f\n\t"
"movb (%1), %%cl\n\t"
"movb %%cl, (%2)\n\t"
".p2align 4\n\t"
"11:"
: "=&c" (d0), "=&S" (d1), "=&D" (d2),
"=r" (d3),"=r" (d4), "=r"(d5)
:"0" (n),
"1" (src),
"2" (dest)
:"memory");
return ret;
}
EXPORT_SYMBOL(memmove);
+206
View File
@@ -0,0 +1,206 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* rax original destination
*/
/*
* memcpy_c() - fast string ops (REP MOVSQ) based variant.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
movq %rdi, %rax
movq %rdx, %rcx
shrq $3, %rcx
andl $7, %edx
rep movsq
movl %edx, %ecx
rep movsb
ret
.Lmemcpy_e:
.previous
/*
* memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
* memcpy_c. Use memcpy_c_e when possible.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
ret
.Lmemcpy_e_e:
.previous
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
movq %rdi, %rax
cmpq $0x20, %rdx
jb .Lhandle_tail
/*
* We check whether memory false dependence could occur,
* then jump to corresponding copy mode.
*/
cmp %dil, %sil
jl .Lcopy_backward
subq $0x20, %rdx
.Lcopy_forward_loop:
subq $0x20, %rdx
/*
* Move in blocks of 4x8 bytes:
*/
movq 0*8(%rsi), %r8
movq 1*8(%rsi), %r9
movq 2*8(%rsi), %r10
movq 3*8(%rsi), %r11
leaq 4*8(%rsi), %rsi
movq %r8, 0*8(%rdi)
movq %r9, 1*8(%rdi)
movq %r10, 2*8(%rdi)
movq %r11, 3*8(%rdi)
leaq 4*8(%rdi), %rdi
jae .Lcopy_forward_loop
addl $0x20, %edx
jmp .Lhandle_tail
.Lcopy_backward:
/*
* Calculate copy position to tail.
*/
addq %rdx, %rsi
addq %rdx, %rdi
subq $0x20, %rdx
/*
* At most 3 ALU operations in one cycle,
* so append NOPS in the same 16bytes trunk.
*/
.p2align 4
.Lcopy_backward_loop:
subq $0x20, %rdx
movq -1*8(%rsi), %r8
movq -2*8(%rsi), %r9
movq -3*8(%rsi), %r10
movq -4*8(%rsi), %r11
leaq -4*8(%rsi), %rsi
movq %r8, -1*8(%rdi)
movq %r9, -2*8(%rdi)
movq %r10, -3*8(%rdi)
movq %r11, -4*8(%rdi)
leaq -4*8(%rdi), %rdi
jae .Lcopy_backward_loop
/*
* Calculate copy position to head.
*/
addl $0x20, %edx
subq %rdx, %rsi
subq %rdx, %rdi
.Lhandle_tail:
cmpl $16, %edx
jb .Lless_16bytes
/*
* Move data from 16 bytes to 31 bytes.
*/
movq 0*8(%rsi), %r8
movq 1*8(%rsi), %r9
movq -2*8(%rsi, %rdx), %r10
movq -1*8(%rsi, %rdx), %r11
movq %r8, 0*8(%rdi)
movq %r9, 1*8(%rdi)
movq %r10, -2*8(%rdi, %rdx)
movq %r11, -1*8(%rdi, %rdx)
retq
.p2align 4
.Lless_16bytes:
cmpl $8, %edx
jb .Lless_8bytes
/*
* Move data from 8 bytes to 15 bytes.
*/
movq 0*8(%rsi), %r8
movq -1*8(%rsi, %rdx), %r9
movq %r8, 0*8(%rdi)
movq %r9, -1*8(%rdi, %rdx)
retq
.p2align 4
.Lless_8bytes:
cmpl $4, %edx
jb .Lless_3bytes
/*
* Move data from 4 bytes to 7 bytes.
*/
movl (%rsi), %ecx
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
retq
.p2align 4
.Lless_3bytes:
subl $1, %edx
jb .Lend
/*
* Move data from 1 bytes to 3 bytes.
*/
movzbl (%rsi), %ecx
jz .Lstore_1byte
movzbq 1(%rsi), %r8
movzbq (%rsi, %rdx), %r9
movb %r8b, 1(%rdi)
movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
movb %cl, (%rdi)
.Lend:
retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
* Some CPUs are adding enhanced REP MOVSB/STOSB feature
* If the feature is supported, memcpy_c_e() is the first choice.
* If enhanced rep movsb copy is not available, use fast string copy
* memcpy_c() when possible. This is faster and code is simpler than
* original memcpy().
* Otherwise, original memcpy() is used.
* In .altinstructions section, ERMS feature is placed after REG_GOOD
* feature to implement the right patch order.
*
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
.section .altinstructions, "a"
altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
.Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
.Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
+223
View File
@@ -0,0 +1,223 @@
/*
* Normally compiler builtins are used, but sometimes the compiler calls out
* of line code. Based on asm-i386/string.h.
*
* This assembly file is re-written from memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
#undef memmove
/*
* Implement memmove(). This can handle overlap between src and dst.
*
* Input:
* rdi: dest
* rsi: src
* rdx: count
*
* Output:
* rax: dest
*/
ENTRY(memmove)
CFI_STARTPROC
/* Handle more 32bytes in loop */
mov %rdi, %rax
cmp $0x20, %rdx
jb 1f
/* Decide forward/backward copy mode */
cmp %rdi, %rsi
jge .Lmemmove_begin_forward
mov %rsi, %r8
add %rdx, %r8
cmp %rdi, %r8
jg 2f
.Lmemmove_begin_forward:
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
*/
cmp $680, %rdx
jb 3f
/*
* movsq instruction is only good for aligned case.
*/
cmpb %dil, %sil
je 4f
3:
sub $0x20, %rdx
/*
* We gobble 32byts forward in each loop.
*/
5:
sub $0x20, %rdx
movq 0*8(%rsi), %r11
movq 1*8(%rsi), %r10
movq 2*8(%rsi), %r9
movq 3*8(%rsi), %r8
leaq 4*8(%rsi), %rsi
movq %r11, 0*8(%rdi)
movq %r10, 1*8(%rdi)
movq %r9, 2*8(%rdi)
movq %r8, 3*8(%rdi)
leaq 4*8(%rdi), %rdi
jae 5b
addq $0x20, %rdx
jmp 1f
/*
* Handle data forward by movsq.
*/
.p2align 4
4:
movq %rdx, %rcx
movq -8(%rsi, %rdx), %r11
lea -8(%rdi, %rdx), %r10
shrq $3, %rcx
rep movsq
movq %r11, (%r10)
jmp 13f
.Lmemmove_end_forward:
/*
* Handle data backward by movsq.
*/
.p2align 4
7:
movq %rdx, %rcx
movq (%rsi), %r11
movq %rdi, %r10
leaq -8(%rsi, %rdx), %rsi
leaq -8(%rdi, %rdx), %rdi
shrq $3, %rcx
std
rep movsq
cld
movq %r11, (%r10)
jmp 13f
/*
* Start to prepare for backward copy.
*/
.p2align 4
2:
cmp $680, %rdx
jb 6f
cmp %dil, %sil
je 7b
6:
/*
* Calculate copy position to tail.
*/
addq %rdx, %rsi
addq %rdx, %rdi
subq $0x20, %rdx
/*
* We gobble 32byts backward in each loop.
*/
8:
subq $0x20, %rdx
movq -1*8(%rsi), %r11
movq -2*8(%rsi), %r10
movq -3*8(%rsi), %r9
movq -4*8(%rsi), %r8
leaq -4*8(%rsi), %rsi
movq %r11, -1*8(%rdi)
movq %r10, -2*8(%rdi)
movq %r9, -3*8(%rdi)
movq %r8, -4*8(%rdi)
leaq -4*8(%rdi), %rdi
jae 8b
/*
* Calculate copy position to head.
*/
addq $0x20, %rdx
subq %rdx, %rsi
subq %rdx, %rdi
1:
cmpq $16, %rdx
jb 9f
/*
* Move data from 16 bytes to 31 bytes.
*/
movq 0*8(%rsi), %r11
movq 1*8(%rsi), %r10
movq -2*8(%rsi, %rdx), %r9
movq -1*8(%rsi, %rdx), %r8
movq %r11, 0*8(%rdi)
movq %r10, 1*8(%rdi)
movq %r9, -2*8(%rdi, %rdx)
movq %r8, -1*8(%rdi, %rdx)
jmp 13f
.p2align 4
9:
cmpq $8, %rdx
jb 10f
/*
* Move data from 8 bytes to 15 bytes.
*/
movq 0*8(%rsi), %r11
movq -1*8(%rsi, %rdx), %r10
movq %r11, 0*8(%rdi)
movq %r10, -1*8(%rdi, %rdx)
jmp 13f
10:
cmpq $4, %rdx
jb 11f
/*
* Move data from 4 bytes to 7 bytes.
*/
movl (%rsi), %r11d
movl -4(%rsi, %rdx), %r10d
movl %r11d, (%rdi)
movl %r10d, -4(%rdi, %rdx)
jmp 13f
11:
cmp $2, %rdx
jb 12f
/*
* Move data from 2 bytes to 3 bytes.
*/
movw (%rsi), %r11w
movw -2(%rsi, %rdx), %r10w
movw %r11w, (%rdi)
movw %r10w, -2(%rdi, %rdx)
jmp 13f
12:
cmp $1, %rdx
jb 13f
/*
* Move data for 1 byte.
*/
movb (%rsi), %r11b
movb %r11b, (%rdi)
13:
retq
CFI_ENDPROC
.section .altinstr_replacement,"ax"
.Lmemmove_begin_forward_efs:
/* Forward moving data. */
movq %rdx, %rcx
rep movsb
retq
.Lmemmove_end_forward_efs:
.previous
.section .altinstructions,"a"
altinstruction_entry .Lmemmove_begin_forward, \
.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
.Lmemmove_end_forward-.Lmemmove_begin_forward, \
.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
.previous
ENDPROC(memmove)
+154
View File
@@ -0,0 +1,154 @@
/* Copyright 2002 Andi Kleen, SuSE Labs */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
movq %rdi,%r9
movq %rdx,%rcx
andl $7,%edx
shrq $3,%rcx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
imulq %rsi,%rax
rep stosq
movl %edx,%ecx
rep stosb
movq %r9,%rax
ret
.Lmemset_e:
.previous
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
* The code is simpler and shorter than the fast string function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
.Lmemset_e_e:
.previous
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
movq %rdi,%r10
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
imulq %rcx,%rax
/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
CFI_REMEMBER_STATE
.Lafter_bad_alignment:
movq %rdx,%rcx
shrq $6,%rcx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decq %rcx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %edx,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8
.Lhandle_7:
andl $7,%edx
jz .Lende
.p2align 4
.Lloop_1:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
.Lende:
movq %r10,%rax
ret
CFI_RESTORE_STATE
.Lbad_alignment:
cmpq $7,%rdx
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
/* Some CPUs support enhanced REP MOVSB/STOSB feature.
* It is recommended to use this when possible.
*
* If enhanced REP MOVSB/STOSB feature is not available, use fast string
* instructions.
*
* Otherwise, use original memset function.
*
* In .altinstructions section, ERMS feature is placed after REG_GOOD
* feature to implement the right patch order.
*/
.section .altinstructions,"a"
altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
.Lfinal-memset,.Lmemset_e-.Lmemset_c
altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
.Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous
+377
View File
@@ -0,0 +1,377 @@
/*
* MMX 3DNow! library helper functions
*
* To do:
* We can use MMX just for prefetch in IRQ's. This may be a win.
* (reported so on K6-III)
* We should use a better code neutral filler for the short jump
* leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
* We also want to clobber the filler register so we don't get any
* register forwarding stalls on the filler.
*
* Add *user handling. Checksums are not a win with MMX on any CPU
* tested so far for any MMX solution figured.
*
* 22/09/2000 - Arjan van de Ven
* Improved for non-egineering-sample Athlons
*
*/
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <asm/i387.h>
#include <asm/asm.h>
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
void *p;
int i;
if (unlikely(in_interrupt()))
return __memcpy(to, from, len);
p = to;
i = len >> 6; /* len/64 */
kernel_fpu_begin();
__asm__ __volatile__ (
"1: prefetch (%0)\n" /* This set is 28 bytes */
" prefetch 64(%0)\n"
" prefetch 128(%0)\n"
" prefetch 192(%0)\n"
" prefetch 256(%0)\n"
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b)
: : "r" (from));
for ( ; i > 5; i--) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq %%mm0, (%1)\n"
" movq %%mm1, 8(%1)\n"
" movq %%mm2, 16(%1)\n"
" movq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm0\n"
" movq 40(%0), %%mm1\n"
" movq 48(%0), %%mm2\n"
" movq 56(%0), %%mm3\n"
" movq %%mm0, 32(%1)\n"
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b)
: : "r" (from), "r" (to) : "memory");
from += 64;
to += 64;
}
for ( ; i > 0; i--) {
__asm__ __volatile__ (
" movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq %%mm0, (%1)\n"
" movq %%mm1, 8(%1)\n"
" movq %%mm2, 16(%1)\n"
" movq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm0\n"
" movq 40(%0), %%mm1\n"
" movq 48(%0), %%mm2\n"
" movq 56(%0), %%mm3\n"
" movq %%mm0, 32(%1)\n"
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
from += 64;
to += 64;
}
/*
* Now do the tail of the block:
*/
__memcpy(to, from, len & 63);
kernel_fpu_end();
return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
#ifdef CONFIG_MK7
/*
* The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
* other MMX using processors do not.
*/
static void fast_clear_page(void *page)
{
int i;
kernel_fpu_begin();
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);
for (i = 0; i < 4096/64; i++) {
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
" movntq %%mm0, 16(%0)\n"
" movntq %%mm0, 24(%0)\n"
" movntq %%mm0, 32(%0)\n"
" movntq %%mm0, 40(%0)\n"
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
page += 64;
}
/*
* Since movntq is weakly-ordered, a "sfence" is needed to become
* ordered again:
*/
__asm__ __volatile__("sfence\n"::);
kernel_fpu_end();
}
static void fast_copy_page(void *to, void *from)
{
int i;
kernel_fpu_begin();
/*
* maybe the prefetch stuff can go before the expensive fnsave...
* but that is for later. -AV
*/
__asm__ __volatile__(
"1: prefetch (%0)\n"
" prefetch 64(%0)\n"
" prefetch 128(%0)\n"
" prefetch 192(%0)\n"
" prefetch 256(%0)\n"
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b) : : "r" (from));
for (i = 0; i < (4096-320)/64; i++) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
from += 64;
to += 64;
}
for (i = (4096-320)/64; i < 4096/64; i++) {
__asm__ __volatile__ (
"2: movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
from += 64;
to += 64;
}
/*
* Since movntq is weakly-ordered, a "sfence" is needed to become
* ordered again:
*/
__asm__ __volatile__("sfence \n"::);
kernel_fpu_end();
}
#else /* CONFIG_MK7 */
/*
* Generic MMX implementation without K7 specific streaming
*/
static void fast_clear_page(void *page)
{
int i;
kernel_fpu_begin();
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);
for (i = 0; i < 4096/128; i++) {
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
" movq %%mm0, 8(%0)\n"
" movq %%mm0, 16(%0)\n"
" movq %%mm0, 24(%0)\n"
" movq %%mm0, 32(%0)\n"
" movq %%mm0, 40(%0)\n"
" movq %%mm0, 48(%0)\n"
" movq %%mm0, 56(%0)\n"
" movq %%mm0, 64(%0)\n"
" movq %%mm0, 72(%0)\n"
" movq %%mm0, 80(%0)\n"
" movq %%mm0, 88(%0)\n"
" movq %%mm0, 96(%0)\n"
" movq %%mm0, 104(%0)\n"
" movq %%mm0, 112(%0)\n"
" movq %%mm0, 120(%0)\n"
: : "r" (page) : "memory");
page += 128;
}
kernel_fpu_end();
}
static void fast_copy_page(void *to, void *from)
{
int i;
kernel_fpu_begin();
__asm__ __volatile__ (
"1: prefetch (%0)\n"
" prefetch 64(%0)\n"
" prefetch 128(%0)\n"
" prefetch 192(%0)\n"
" prefetch 256(%0)\n"
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b) : : "r" (from));
for (i = 0; i < 4096/64; i++) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq %%mm0, (%1)\n"
" movq %%mm1, 8(%1)\n"
" movq %%mm2, 16(%1)\n"
" movq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm0\n"
" movq 40(%0), %%mm1\n"
" movq 48(%0), %%mm2\n"
" movq 56(%0), %%mm3\n"
" movq %%mm0, 32(%1)\n"
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b)
: : "r" (from), "r" (to) : "memory");
from += 64;
to += 64;
}
kernel_fpu_end();
}
#endif /* !CONFIG_MK7 */
/*
* Favour MMX for page clear and copy:
*/
static void slow_zero_page(void *page)
{
int d0, d1;
__asm__ __volatile__(
"cld\n\t"
"rep ; stosl"
: "=&c" (d0), "=&D" (d1)
:"a" (0), "1" (page), "0" (1024)
:"memory");
}
void mmx_clear_page(void *page)
{
if (unlikely(in_interrupt()))
slow_zero_page(page);
else
fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);
static void slow_copy_page(void *to, void *from)
{
int d0, d1, d2;
__asm__ __volatile__(
"cld\n\t"
"rep ; movsl"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (1024), "1" ((long) to), "2" ((long) from)
: "memory");
}
void mmx_copy_page(void *to, void *from)
{
if (unlikely(in_interrupt()))
slow_copy_page(to, from);
else
fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
+5
View File
@@ -0,0 +1,5 @@
#include <linux/module.h>
#include <asm/msr.h>
EXPORT_SYMBOL(native_rdmsr_safe_regs);
EXPORT_SYMBOL(native_wrmsr_safe_regs);
+102
View File
@@ -0,0 +1,102 @@
#include <linux/linkage.h>
#include <linux/errno.h>
#include <asm/dwarf2.h>
#include <asm/asm.h>
#include <asm/msr.h>
#ifdef CONFIG_X86_64
/*
* int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
*
* reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
*
*/
.macro op_safe_regs op
ENTRY(native_\op\()_safe_regs)
CFI_STARTPROC
pushq_cfi %rbx
pushq_cfi %rbp
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
movl 4(%rdi), %ecx
movl 8(%rdi), %edx
movl 12(%rdi), %ebx
movl 20(%rdi), %ebp
movl 24(%rdi), %esi
movl 28(%rdi), %edi
CFI_REMEMBER_STATE
1: \op
2: movl %eax, (%r10)
movl %r11d, %eax /* Return value */
movl %ecx, 4(%r10)
movl %edx, 8(%r10)
movl %ebx, 12(%r10)
movl %ebp, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
popq_cfi %rbp
popq_cfi %rbx
ret
3:
CFI_RESTORE_STATE
movl $-EIO, %r11d
jmp 2b
_ASM_EXTABLE(1b, 3b)
CFI_ENDPROC
ENDPROC(native_\op\()_safe_regs)
.endm
#else /* X86_32 */
.macro op_safe_regs op
ENTRY(native_\op\()_safe_regs)
CFI_STARTPROC
pushl_cfi %ebx
pushl_cfi %ebp
pushl_cfi %esi
pushl_cfi %edi
pushl_cfi $0 /* Return value */
pushl_cfi %eax
movl 4(%eax), %ecx
movl 8(%eax), %edx
movl 12(%eax), %ebx
movl 20(%eax), %ebp
movl 24(%eax), %esi
movl 28(%eax), %edi
movl (%eax), %eax
CFI_REMEMBER_STATE
1: \op
2: pushl_cfi %eax
movl 4(%esp), %eax
popl_cfi (%eax)
addl $4, %esp
CFI_ADJUST_CFA_OFFSET -4
movl %ecx, 4(%eax)
movl %edx, 8(%eax)
movl %ebx, 12(%eax)
movl %ebp, 20(%eax)
movl %esi, 24(%eax)
movl %edi, 28(%eax)
popl_cfi %eax
popl_cfi %edi
popl_cfi %esi
popl_cfi %ebp
popl_cfi %ebx
ret
3:
CFI_RESTORE_STATE
movl $-EIO, 4(%esp)
jmp 2b
_ASM_EXTABLE(1b, 3b)
CFI_ENDPROC
ENDPROC(native_\op\()_safe_regs)
.endm
#endif
op_safe_regs rdmsr
op_safe_regs wrmsr
+204
View File
@@ -0,0 +1,204 @@
#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <asm/msr.h>
static void __rdmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
struct msr *reg;
int this_cpu = raw_smp_processor_id();
if (rv->msrs)
reg = per_cpu_ptr(rv->msrs, this_cpu);
else
reg = &rv->reg;
rdmsr(rv->msr_no, reg->l, reg->h);
}
static void __wrmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
struct msr *reg;
int this_cpu = raw_smp_processor_id();
if (rv->msrs)
reg = per_cpu_ptr(rv->msrs, this_cpu);
else
reg = &rv->reg;
wrmsr(rv->msr_no, reg->l, reg->h);
}
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
int err;
struct msr_info rv;
memset(&rv, 0, sizeof(rv));
rv.msr_no = msr_no;
err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
*l = rv.reg.l;
*h = rv.reg.h;
return err;
}
EXPORT_SYMBOL(rdmsr_on_cpu);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
int err;
struct msr_info rv;
memset(&rv, 0, sizeof(rv));
rv.msr_no = msr_no;
rv.reg.l = l;
rv.reg.h = h;
err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
return err;
}
EXPORT_SYMBOL(wrmsr_on_cpu);
static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
struct msr *msrs,
void (*msr_func) (void *info))
{
struct msr_info rv;
int this_cpu;
memset(&rv, 0, sizeof(rv));
rv.msrs = msrs;
rv.msr_no = msr_no;
this_cpu = get_cpu();
if (cpumask_test_cpu(this_cpu, mask))
msr_func(&rv);
smp_call_function_many(mask, msr_func, &rv, 1);
put_cpu();
}
/* rdmsr on a bunch of CPUs
*
* @mask: which CPUs
* @msr_no: which MSR
* @msrs: array of MSR values
*
*/
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
{
__rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
}
EXPORT_SYMBOL(rdmsr_on_cpus);
/*
* wrmsr on a bunch of CPUs
*
* @mask: which CPUs
* @msr_no: which MSR
* @msrs: array of MSR values
*
*/
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
{
__rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
}
EXPORT_SYMBOL(wrmsr_on_cpus);
/* These "safe" variants are slower and should be used when the target MSR
may not actually exist. */
static void __rdmsr_safe_on_cpu(void *info)
{
struct msr_info *rv = info;
rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
}
static void __wrmsr_safe_on_cpu(void *info)
{
struct msr_info *rv = info;
rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
}
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
int err;
struct msr_info rv;
memset(&rv, 0, sizeof(rv));
rv.msr_no = msr_no;
err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
*l = rv.reg.l;
*h = rv.reg.h;
return err ? err : rv.err;
}
EXPORT_SYMBOL(rdmsr_safe_on_cpu);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
int err;
struct msr_info rv;
memset(&rv, 0, sizeof(rv));
rv.msr_no = msr_no;
rv.reg.l = l;
rv.reg.h = h;
err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
return err ? err : rv.err;
}
EXPORT_SYMBOL(wrmsr_safe_on_cpu);
/*
* These variants are significantly slower, but allows control over
* the entire 32-bit GPR set.
*/
static void __rdmsr_safe_regs_on_cpu(void *info)
{
struct msr_regs_info *rv = info;
rv->err = rdmsr_safe_regs(rv->regs);
}
static void __wrmsr_safe_regs_on_cpu(void *info)
{
struct msr_regs_info *rv = info;
rv->err = wrmsr_safe_regs(rv->regs);
}
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
{
int err;
struct msr_regs_info rv;
rv.regs = regs;
rv.err = -EIO;
err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
return err ? err : rv.err;
}
EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
{
int err;
struct msr_regs_info rv;
rv.regs = regs;
rv.err = -EIO;
err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
return err ? err : rv.err;
}
EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
+23
View File
@@ -0,0 +1,23 @@
#include <linux/module.h>
#include <linux/preempt.h>
#include <asm/msr.h>
struct msr *msrs_alloc(void)
{
struct msr *msrs = NULL;
msrs = alloc_percpu(struct msr);
if (!msrs) {
pr_warning("%s: error allocating msrs\n", __func__);
return NULL;
}
return msrs;
}
EXPORT_SYMBOL(msrs_alloc);
void msrs_free(struct msr *msrs)
{
free_percpu(msrs);
}
EXPORT_SYMBOL(msrs_free);
+97
View File
@@ -0,0 +1,97 @@
/*
* __put_user functions.
*
* (C) Copyright 2005 Linus Torvalds
* (C) Copyright 2005 Andi Kleen
* (C) Copyright 2008 Glauber Costa
*
* These functions have a non-standard call interface
* to make them more efficient, especially as they
* return an error value in addition to the "real"
* return value.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/thread_info.h>
#include <asm/errno.h>
#include <asm/asm.h>
/*
* __put_user_X
*
* Inputs: %eax[:%edx] contains the data
* %ecx contains the address
*
* Outputs: %eax is error code (0 or -EFAULT)
*
* These functions should not modify any other registers,
* as they get called from within inline assembly.
*/
#define ENTER CFI_STARTPROC ; \
GET_THREAD_INFO(%_ASM_BX)
#define EXIT ret ; \
CFI_ENDPROC
.text
ENTRY(__put_user_1)
ENTER
cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
jae bad_put_user
1: movb %al,(%_ASM_CX)
xor %eax,%eax
EXIT
ENDPROC(__put_user_1)
ENTRY(__put_user_2)
ENTER
mov TI_addr_limit(%_ASM_BX),%_ASM_BX
sub $1,%_ASM_BX
cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
2: movw %ax,(%_ASM_CX)
xor %eax,%eax
EXIT
ENDPROC(__put_user_2)
ENTRY(__put_user_4)
ENTER
mov TI_addr_limit(%_ASM_BX),%_ASM_BX
sub $3,%_ASM_BX
cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
3: movl %eax,(%_ASM_CX)
xor %eax,%eax
EXIT
ENDPROC(__put_user_4)
ENTRY(__put_user_8)
ENTER
mov TI_addr_limit(%_ASM_BX),%_ASM_BX
sub $7,%_ASM_BX
cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
4: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
5: movl %edx,4(%_ASM_CX)
#endif
xor %eax,%eax
EXIT
ENDPROC(__put_user_8)
bad_put_user:
CFI_STARTPROC
movl $-EFAULT,%eax
EXIT
END(bad_put_user)
.section __ex_table,"a"
_ASM_PTR 1b,bad_put_user
_ASM_PTR 2b,bad_put_user
_ASM_PTR 3b,bad_put_user
_ASM_PTR 4b,bad_put_user
#ifdef CONFIG_X86_32
_ASM_PTR 5b,bad_put_user
#endif
.previous
+44
View File
@@ -0,0 +1,44 @@
/* Slow paths of read/write spinlocks. */
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
#include <asm/frame.h>
#include <asm/rwlock.h>
#ifdef CONFIG_X86_32
# define __lock_ptr eax
#else
# define __lock_ptr rdi
#endif
ENTRY(__write_lock_failed)
CFI_STARTPROC
FRAME
0: LOCK_PREFIX
WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
1: rep; nop
cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
jne 1b
LOCK_PREFIX
WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
jnz 0b
ENDFRAME
ret
CFI_ENDPROC
END(__write_lock_failed)
ENTRY(__read_lock_failed)
CFI_STARTPROC
FRAME
0: LOCK_PREFIX
READ_LOCK_SIZE(inc) (%__lock_ptr)
1: rep; nop
READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
js 1b
LOCK_PREFIX
READ_LOCK_SIZE(dec) (%__lock_ptr)
js 0b
ENDFRAME
ret
CFI_ENDPROC
END(__read_lock_failed)
+136
View File
@@ -0,0 +1,136 @@
/*
* x86 semaphore implementation.
*
* (C) Copyright 1999 Linus Torvalds
*
* Portions Copyright 1999 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
*/
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
#include <asm/dwarf2.h>
#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
#ifdef CONFIG_X86_32
/*
* The semaphore operations have a special calling sequence that
* allow us to do a simpler in-line version of them. These routines
* need to convert that sequence back into the C sequence when
* there is contention on the semaphore.
*
* %eax contains the semaphore pointer on entry. Save the C-clobbered
* registers (%eax, %edx and %ecx) except %eax whish is either a return
* value or just clobbered..
*/
#define save_common_regs \
pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
#define restore_common_regs \
popl_cfi %ecx; CFI_RESTORE ecx
/* Avoid uglifying the argument copying x86-64 needs to do. */
.macro movq src, dst
.endm
#else
/*
* x86-64 rwsem wrappers
*
* This interfaces the inline asm code to the slow-path
* C routines. We need to save the call-clobbered regs
* that the asm does not mark as clobbered, and move the
* argument from %rax to %rdi.
*
* NOTE! We don't need to save %rax, because the functions
* will always return the semaphore pointer in %rax (which
* is also the input argument to these helpers)
*
* The following can clobber %rdx because the asm clobbers it:
* call_rwsem_down_write_failed
* call_rwsem_wake
* but %rdi, %rsi, %rcx, %r8-r11 always need saving.
*/
#define save_common_regs \
pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
pushq_cfi %r11; CFI_REL_OFFSET r11, 0
#define restore_common_regs \
popq_cfi %r11; CFI_RESTORE r11; \
popq_cfi %r10; CFI_RESTORE r10; \
popq_cfi %r9; CFI_RESTORE r9; \
popq_cfi %r8; CFI_RESTORE r8; \
popq_cfi %rcx; CFI_RESTORE rcx; \
popq_cfi %rsi; CFI_RESTORE rsi; \
popq_cfi %rdi; CFI_RESTORE rdi
#endif
/* Fix up special calling conventions */
ENTRY(call_rwsem_down_read_failed)
CFI_STARTPROC
save_common_regs
__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_down_read_failed
__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
ENDPROC(call_rwsem_down_read_failed)
ENTRY(call_rwsem_down_write_failed)
CFI_STARTPROC
save_common_regs
movq %rax,%rdi
call rwsem_down_write_failed
restore_common_regs
ret
CFI_ENDPROC
ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
CFI_STARTPROC
/* do nothing if still outstanding active readers */
__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
jnz 1f
save_common_regs
movq %rax,%rdi
call rwsem_wake
restore_common_regs
1: ret
CFI_ENDPROC
ENDPROC(call_rwsem_wake)
ENTRY(call_rwsem_downgrade_wake)
CFI_STARTPROC
save_common_regs
__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_downgrade_wake
__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
ENDPROC(call_rwsem_downgrade_wake)
+235
View File
@@ -0,0 +1,235 @@
/*
* Most of the string-functions are rather heavily hand-optimized,
* see especially strsep,strstr,str[c]spn. They should work, but are not
* very easy to understand. Everything is done entirely within the register
* set, making the functions fast and clean. String instructions have been
* used through-out, making for "slightly" unclear code :-)
*
* AK: On P4 and K7 using non string instruction implementations might be faster
* for large memory blocks. But most of them are unlikely to be used on large
* strings.
*/
#include <linux/string.h>
#include <linux/module.h>
#ifdef __HAVE_ARCH_STRCPY
char *strcpy(char *dest, const char *src)
{
int d0, d1, d2;
asm volatile("1:\tlodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
: "0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
#endif
#ifdef __HAVE_ARCH_STRNCPY
char *strncpy(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
asm volatile("1:\tdecl %2\n\t"
"js 2f\n\t"
"lodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n\t"
"rep\n\t"
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
: "0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
#endif
#ifdef __HAVE_ARCH_STRCAT
char *strcat(char *dest, const char *src)
{
int d0, d1, d2, d3;
asm volatile("repne\n\t"
"scasb\n\t"
"decl %1\n"
"1:\tlodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
: "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
return dest;
}
EXPORT_SYMBOL(strcat);
#endif
#ifdef __HAVE_ARCH_STRNCAT
char *strncat(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
asm volatile("repne\n\t"
"scasb\n\t"
"decl %1\n\t"
"movl %8,%3\n"
"1:\tdecl %3\n\t"
"js 2f\n\t"
"lodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n"
"2:\txorl %2,%2\n\t"
"stosb"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
: "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu), "g" (count)
: "memory");
return dest;
}
EXPORT_SYMBOL(strncat);
#endif
#ifdef __HAVE_ARCH_STRCMP
int strcmp(const char *cs, const char *ct)
{
int d0, d1;
int res;
asm volatile("1:\tlodsb\n\t"
"scasb\n\t"
"jne 2f\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"jmp 3f\n"
"2:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"3:"
: "=a" (res), "=&S" (d0), "=&D" (d1)
: "1" (cs), "2" (ct)
: "memory");
return res;
}
EXPORT_SYMBOL(strcmp);
#endif
#ifdef __HAVE_ARCH_STRNCMP
int strncmp(const char *cs, const char *ct, size_t count)
{
int res;
int d0, d1, d2;
asm volatile("1:\tdecl %3\n\t"
"js 2f\n\t"
"lodsb\n\t"
"scasb\n\t"
"jne 3f\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n"
"2:\txorl %%eax,%%eax\n\t"
"jmp 4f\n"
"3:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"4:"
: "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
: "1" (cs), "2" (ct), "3" (count)
: "memory");
return res;
}
EXPORT_SYMBOL(strncmp);
#endif
#ifdef __HAVE_ARCH_STRCHR
char *strchr(const char *s, int c)
{
int d0;
char *res;
asm volatile("movb %%al,%%ah\n"
"1:\tlodsb\n\t"
"cmpb %%ah,%%al\n\t"
"je 2f\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n\t"
"movl $1,%1\n"
"2:\tmovl %1,%0\n\t"
"decl %0"
: "=a" (res), "=&S" (d0)
: "1" (s), "0" (c)
: "memory");
return res;
}
EXPORT_SYMBOL(strchr);
#endif
#ifdef __HAVE_ARCH_STRLEN
size_t strlen(const char *s)
{
int d0;
size_t res;
asm volatile("repne\n\t"
"scasb"
: "=c" (res), "=&D" (d0)
: "1" (s), "a" (0), "0" (0xffffffffu)
: "memory");
return ~res - 1;
}
EXPORT_SYMBOL(strlen);
#endif
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *cs, int c, size_t count)
{
int d0;
void *res;
if (!count)
return NULL;
asm volatile("repne\n\t"
"scasb\n\t"
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
: "=D" (res), "=&c" (d0)
: "a" (c), "0" (cs), "1" (count)
: "memory");
return res;
}
EXPORT_SYMBOL(memchr);
#endif
#ifdef __HAVE_ARCH_MEMSCAN
void *memscan(void *addr, int c, size_t size)
{
if (!size)
return addr;
asm volatile("repnz; scasb\n\t"
"jnz 1f\n\t"
"dec %%edi\n"
"1:"
: "=D" (addr), "=c" (size)
: "0" (addr), "1" (size), "a" (c)
: "memory");
return addr;
}
EXPORT_SYMBOL(memscan);
#endif
#ifdef __HAVE_ARCH_STRNLEN
size_t strnlen(const char *s, size_t count)
{
int d0;
int res;
asm volatile("movl %2,%0\n\t"
"jmp 2f\n"
"1:\tcmpb $0,(%0)\n\t"
"je 3f\n\t"
"incl %0\n"
"2:\tdecl %1\n\t"
"cmpl $-1,%1\n\t"
"jne 1b\n"
"3:\tsubl %2,%0"
: "=a" (res), "=&d" (d0)
: "c" (s), "1" (count)
: "memory");
return res;
}
EXPORT_SYMBOL(strnlen);
#endif
+31
View File
@@ -0,0 +1,31 @@
#include <linux/string.h>
char *strstr(const char *cs, const char *ct)
{
int d0, d1;
register char *__res;
__asm__ __volatile__(
"movl %6,%%edi\n\t"
"repne\n\t"
"scasb\n\t"
"notl %%ecx\n\t"
"decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
"movl %%ecx,%%edx\n"
"1:\tmovl %6,%%edi\n\t"
"movl %%esi,%%eax\n\t"
"movl %%edx,%%ecx\n\t"
"repe\n\t"
"cmpsb\n\t"
"je 2f\n\t" /* also works for empty string, see above */
"xchgl %%eax,%%esi\n\t"
"incl %%esi\n\t"
"cmpb $0,-1(%%eax)\n\t"
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"2:"
: "=a" (__res), "=&c" (d0), "=&S" (d1)
: "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
: "dx", "di");
return __res;
}
+29
View File
@@ -0,0 +1,29 @@
/*
* Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
* Copyright 2008 by Steven Rostedt, Red Hat, Inc
* (inspired by Andi Kleen's thunk_64.S)
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
#ifdef CONFIG_TRACE_IRQFLAGS
/* put return address in eax (arg1) */
.macro thunk_ra name,func
.globl \name
\name:
pushl %eax
pushl %ecx
pushl %edx
/* Place EIP in the arg1 */
movl 3*4(%esp), %eax
call \func
popl %edx
popl %ecx
popl %eax
ret
.endm
thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
#endif
+45
View File
@@ -0,0 +1,45 @@
/*
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
* Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
\name:
CFI_STARTPROC
/* this one pushes 9 elems, the next one would be %rIP */
SAVE_ARGS
.if \put_ret_addr_in_rdi
movq_cfi_restore 9*8, rdi
.endif
call \func
jmp restore
CFI_ENDPROC
.endm
#ifdef CONFIG_TRACE_IRQFLAGS
THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
SAVE_ARGS
restore:
RESTORE_ARGS
ret
CFI_ENDPROC
+146
View File
@@ -0,0 +1,146 @@
/*
* User address space access functions.
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/highmem.h>
#include <linux/module.h>
#include <asm/word-at-a-time.h>
/*
* best effort, GUP based copy_from_user() that is NMI-safe
*/
unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
unsigned long offset, addr = (unsigned long)from;
unsigned long size, len = 0;
struct page *page;
void *map;
int ret;
do {
ret = __get_user_pages_fast(addr, 1, 0, &page);
if (!ret)
break;
offset = addr & (PAGE_SIZE - 1);
size = min(PAGE_SIZE - offset, n - len);
map = kmap_atomic(page);
memcpy(to, map+offset, size);
kunmap_atomic(map);
put_page(page);
len += size;
to += size;
addr += size;
} while (len < n);
return len;
}
EXPORT_SYMBOL_GPL(copy_from_user_nmi);
static inline unsigned long count_bytes(unsigned long mask)
{
mask = (mask - 1) & ~mask;
mask >>= 7;
return count_masked_bytes(mask);
}
/*
* Do a strncpy, return length of string without final '\0'.
* 'count' is the user-supplied count (return 'count' if we
* hit it), 'max' is the address space maximum (and we return
* -EFAULT if we hit it).
*/
static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max)
{
long res = 0;
/*
* Truncate 'max' to the user-specified limit, so that
* we only have one limit we need to check in the loop
*/
if (max > count)
max = count;
while (max >= sizeof(unsigned long)) {
unsigned long c;
/* Fall back to byte-at-a-time if we get a page fault */
if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
break;
/* This can write a few bytes past the NUL character, but that's ok */
*(unsigned long *)(dst+res) = c;
c = has_zero(c);
if (c)
return res + count_bytes(c);
res += sizeof(unsigned long);
max -= sizeof(unsigned long);
}
while (max) {
char c;
if (unlikely(__get_user(c,src+res)))
return -EFAULT;
dst[res] = c;
if (!c)
return res;
res++;
max--;
}
/*
* Uhhuh. We hit 'max'. But was that the user-specified maximum
* too? If so, that's ok - we got as much as the user asked for.
*/
if (res >= count)
return res;
/*
* Nope: we hit the address space limit, and we still had more
* characters the caller would have wanted. That's an EFAULT.
*/
return -EFAULT;
}
/**
* strncpy_from_user: - Copy a NUL terminated string from userspace.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
* @src: Source address, in user space.
* @count: Maximum number of bytes to copy, including the trailing NUL.
*
* Copies a NUL-terminated string from userspace to kernel space.
*
* On success, returns the length of the string (not including the trailing
* NUL).
*
* If access to userspace fails, returns -EFAULT (some data may have been
* copied).
*
* If @count is smaller than the length of the string, copies @count bytes
* and returns @count.
*/
long
strncpy_from_user(char *dst, const char __user *src, long count)
{
unsigned long max_addr, src_addr;
if (unlikely(count <= 0))
return 0;
max_addr = current_thread_info()->addr_limit.seg;
src_addr = (unsigned long)src;
if (likely(src_addr < max_addr)) {
unsigned long max = max_addr - src_addr;
return do_strncpy_from_user(dst, src, count, max);
}
return -EFAULT;
}
EXPORT_SYMBOL(strncpy_from_user);
+804
View File
@@ -0,0 +1,804 @@
/*
* User address space access functions.
* The non inlined parts of asm-i386/uaccess.h are here.
*
* Copyright 1997 Andi Kleen <ak@muc.de>
* Copyright 1997 Linus Torvalds
*/
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/interrupt.h>
#include <asm/uaccess.h>
#include <asm/mmx.h>
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
* Alignment at which movsl is preferred for bulk memory copies.
*/
struct movsl_mask movsl_mask __read_mostly;
#endif
static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask))
return 0;
#endif
return 1;
}
#define movsl_is_ok(a1, a2, n) \
__movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n))
/*
* Zero Userspace
*/
#define __do_clear_user(addr,size) \
do { \
int __d0; \
might_fault(); \
__asm__ __volatile__( \
"0: rep; stosl\n" \
" movl %2,%0\n" \
"1: rep; stosb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: lea 0(%2,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(0b,3b) \
_ASM_EXTABLE(1b,2b) \
: "=&c"(size), "=&D" (__d0) \
: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
} while (0)
/**
* clear_user: - Zero a block of memory in user space.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space.
*
* Returns number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
clear_user(void __user *to, unsigned long n)
{
might_fault();
if (access_ok(VERIFY_WRITE, to, n))
__do_clear_user(to, n);
return n;
}
EXPORT_SYMBOL(clear_user);
/**
* __clear_user: - Zero a block of memory in user space, with less checking.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space. Caller must check
* the specified block with access_ok() before calling this function.
*
* Returns number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
__clear_user(void __user *to, unsigned long n)
{
__do_clear_user(to, n);
return n;
}
EXPORT_SYMBOL(__clear_user);
/**
* strnlen_user: - Get the size of a string in user space.
* @s: The string to measure.
* @n: The maximum valid length
*
* Get the size of a NUL-terminated string in user space.
*
* Returns the size of the string INCLUDING the terminating NUL.
* On exception, returns 0.
* If the string is too long, returns a value greater than @n.
*/
long strnlen_user(const char __user *s, long n)
{
unsigned long mask = -__addr_ok(s);
unsigned long res, tmp;
might_fault();
__asm__ __volatile__(
" testl %0, %0\n"
" jz 3f\n"
" andl %0,%%ecx\n"
"0: repne; scasb\n"
" setne %%al\n"
" subl %%ecx,%0\n"
" addl %0,%%eax\n"
"1:\n"
".section .fixup,\"ax\"\n"
"2: xorl %%eax,%%eax\n"
" jmp 1b\n"
"3: movb $1,%%al\n"
" jmp 1b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 0b,2b\n"
".previous"
:"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp)
:"0" (n), "1" (s), "2" (0), "3" (mask)
:"cc");
return res & mask;
}
EXPORT_SYMBOL(strnlen_user);
#ifdef CONFIG_X86_INTEL_USERCOPY
static unsigned long
__copy_user_intel(void __user *to, const void *from, unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"1: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 3f\n"
"2: movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"3: movl 0(%4), %%eax\n"
"4: movl 4(%4), %%edx\n"
"5: movl %%eax, 0(%3)\n"
"6: movl %%edx, 4(%3)\n"
"7: movl 8(%4), %%eax\n"
"8: movl 12(%4),%%edx\n"
"9: movl %%eax, 8(%3)\n"
"10: movl %%edx, 12(%3)\n"
"11: movl 16(%4), %%eax\n"
"12: movl 20(%4), %%edx\n"
"13: movl %%eax, 16(%3)\n"
"14: movl %%edx, 20(%3)\n"
"15: movl 24(%4), %%eax\n"
"16: movl 28(%4), %%edx\n"
"17: movl %%eax, 24(%3)\n"
"18: movl %%edx, 28(%3)\n"
"19: movl 32(%4), %%eax\n"
"20: movl 36(%4), %%edx\n"
"21: movl %%eax, 32(%3)\n"
"22: movl %%edx, 36(%3)\n"
"23: movl 40(%4), %%eax\n"
"24: movl 44(%4), %%edx\n"
"25: movl %%eax, 40(%3)\n"
"26: movl %%edx, 44(%3)\n"
"27: movl 48(%4), %%eax\n"
"28: movl 52(%4), %%edx\n"
"29: movl %%eax, 48(%3)\n"
"30: movl %%edx, 52(%3)\n"
"31: movl 56(%4), %%eax\n"
"32: movl 60(%4), %%edx\n"
"33: movl %%eax, 56(%3)\n"
"34: movl %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 1b\n"
"35: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"99: rep; movsl\n"
"36: movl %%eax, %0\n"
"37: rep; movsb\n"
"100:\n"
".section .fixup,\"ax\"\n"
"101: lea 0(%%eax,%0,4),%0\n"
" jmp 100b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 1b,100b\n"
" .long 2b,100b\n"
" .long 3b,100b\n"
" .long 4b,100b\n"
" .long 5b,100b\n"
" .long 6b,100b\n"
" .long 7b,100b\n"
" .long 8b,100b\n"
" .long 9b,100b\n"
" .long 10b,100b\n"
" .long 11b,100b\n"
" .long 12b,100b\n"
" .long 13b,100b\n"
" .long 14b,100b\n"
" .long 15b,100b\n"
" .long 16b,100b\n"
" .long 17b,100b\n"
" .long 18b,100b\n"
" .long 19b,100b\n"
" .long 20b,100b\n"
" .long 21b,100b\n"
" .long 22b,100b\n"
" .long 23b,100b\n"
" .long 24b,100b\n"
" .long 25b,100b\n"
" .long 26b,100b\n"
" .long 27b,100b\n"
" .long 28b,100b\n"
" .long 29b,100b\n"
" .long 30b,100b\n"
" .long 31b,100b\n"
" .long 32b,100b\n"
" .long 33b,100b\n"
" .long 34b,100b\n"
" .long 35b,100b\n"
" .long 36b,100b\n"
" .long 37b,100b\n"
" .long 99b,101b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
static unsigned long
__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 2f\n"
"1: movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"2: movl 0(%4), %%eax\n"
"21: movl 4(%4), %%edx\n"
" movl %%eax, 0(%3)\n"
" movl %%edx, 4(%3)\n"
"3: movl 8(%4), %%eax\n"
"31: movl 12(%4),%%edx\n"
" movl %%eax, 8(%3)\n"
" movl %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
" movl %%eax, 16(%3)\n"
" movl %%edx, 20(%3)\n"
"10: movl 24(%4), %%eax\n"
"51: movl 28(%4), %%edx\n"
" movl %%eax, 24(%3)\n"
" movl %%edx, 28(%3)\n"
"11: movl 32(%4), %%eax\n"
"61: movl 36(%4), %%edx\n"
" movl %%eax, 32(%3)\n"
" movl %%edx, 36(%3)\n"
"12: movl 40(%4), %%eax\n"
"71: movl 44(%4), %%edx\n"
" movl %%eax, 40(%3)\n"
" movl %%edx, 44(%3)\n"
"13: movl 48(%4), %%eax\n"
"81: movl 52(%4), %%edx\n"
" movl %%eax, 48(%3)\n"
" movl %%edx, 52(%3)\n"
"14: movl 56(%4), %%eax\n"
"91: movl 60(%4), %%edx\n"
" movl %%eax, 56(%3)\n"
" movl %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
"16: pushl %0\n"
" pushl %%eax\n"
" xorl %%eax,%%eax\n"
" rep; stosb\n"
" popl %%eax\n"
" popl %0\n"
" jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 0b,16b\n"
" .long 1b,16b\n"
" .long 2b,16b\n"
" .long 21b,16b\n"
" .long 3b,16b\n"
" .long 31b,16b\n"
" .long 4b,16b\n"
" .long 41b,16b\n"
" .long 10b,16b\n"
" .long 51b,16b\n"
" .long 11b,16b\n"
" .long 61b,16b\n"
" .long 12b,16b\n"
" .long 71b,16b\n"
" .long 13b,16b\n"
" .long 81b,16b\n"
" .long 14b,16b\n"
" .long 91b,16b\n"
" .long 6b,9b\n"
" .long 7b,16b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
/*
* Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
* hyoshiok@miraclelinux.com
*/
static unsigned long __copy_user_zeroing_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 2f\n"
"1: movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"2: movl 0(%4), %%eax\n"
"21: movl 4(%4), %%edx\n"
" movnti %%eax, 0(%3)\n"
" movnti %%edx, 4(%3)\n"
"3: movl 8(%4), %%eax\n"
"31: movl 12(%4),%%edx\n"
" movnti %%eax, 8(%3)\n"
" movnti %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
" movnti %%eax, 16(%3)\n"
" movnti %%edx, 20(%3)\n"
"10: movl 24(%4), %%eax\n"
"51: movl 28(%4), %%edx\n"
" movnti %%eax, 24(%3)\n"
" movnti %%edx, 28(%3)\n"
"11: movl 32(%4), %%eax\n"
"61: movl 36(%4), %%edx\n"
" movnti %%eax, 32(%3)\n"
" movnti %%edx, 36(%3)\n"
"12: movl 40(%4), %%eax\n"
"71: movl 44(%4), %%edx\n"
" movnti %%eax, 40(%3)\n"
" movnti %%edx, 44(%3)\n"
"13: movl 48(%4), %%eax\n"
"81: movl 52(%4), %%edx\n"
" movnti %%eax, 48(%3)\n"
" movnti %%edx, 52(%3)\n"
"14: movl 56(%4), %%eax\n"
"91: movl 60(%4), %%edx\n"
" movnti %%eax, 56(%3)\n"
" movnti %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
" sfence \n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
"16: pushl %0\n"
" pushl %%eax\n"
" xorl %%eax,%%eax\n"
" rep; stosb\n"
" popl %%eax\n"
" popl %0\n"
" jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 0b,16b\n"
" .long 1b,16b\n"
" .long 2b,16b\n"
" .long 21b,16b\n"
" .long 3b,16b\n"
" .long 31b,16b\n"
" .long 4b,16b\n"
" .long 41b,16b\n"
" .long 10b,16b\n"
" .long 51b,16b\n"
" .long 11b,16b\n"
" .long 61b,16b\n"
" .long 12b,16b\n"
" .long 71b,16b\n"
" .long 13b,16b\n"
" .long 81b,16b\n"
" .long 14b,16b\n"
" .long 91b,16b\n"
" .long 6b,9b\n"
" .long 7b,16b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
static unsigned long __copy_user_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 2f\n"
"1: movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"2: movl 0(%4), %%eax\n"
"21: movl 4(%4), %%edx\n"
" movnti %%eax, 0(%3)\n"
" movnti %%edx, 4(%3)\n"
"3: movl 8(%4), %%eax\n"
"31: movl 12(%4),%%edx\n"
" movnti %%eax, 8(%3)\n"
" movnti %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
" movnti %%eax, 16(%3)\n"
" movnti %%edx, 20(%3)\n"
"10: movl 24(%4), %%eax\n"
"51: movl 28(%4), %%edx\n"
" movnti %%eax, 24(%3)\n"
" movnti %%edx, 28(%3)\n"
"11: movl 32(%4), %%eax\n"
"61: movl 36(%4), %%edx\n"
" movnti %%eax, 32(%3)\n"
" movnti %%edx, 36(%3)\n"
"12: movl 40(%4), %%eax\n"
"71: movl 44(%4), %%edx\n"
" movnti %%eax, 40(%3)\n"
" movnti %%edx, 44(%3)\n"
"13: movl 48(%4), %%eax\n"
"81: movl 52(%4), %%edx\n"
" movnti %%eax, 48(%3)\n"
" movnti %%edx, 52(%3)\n"
"14: movl 56(%4), %%eax\n"
"91: movl 60(%4), %%edx\n"
" movnti %%eax, 56(%3)\n"
" movnti %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
" sfence \n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
"16: jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 0b,16b\n"
" .long 1b,16b\n"
" .long 2b,16b\n"
" .long 21b,16b\n"
" .long 3b,16b\n"
" .long 31b,16b\n"
" .long 4b,16b\n"
" .long 41b,16b\n"
" .long 10b,16b\n"
" .long 51b,16b\n"
" .long 11b,16b\n"
" .long 61b,16b\n"
" .long 12b,16b\n"
" .long 71b,16b\n"
" .long 13b,16b\n"
" .long 81b,16b\n"
" .long 14b,16b\n"
" .long 91b,16b\n"
" .long 6b,9b\n"
" .long 7b,16b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
#else
/*
* Leave these declared but undefined. They should not be any references to
* them
*/
unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
unsigned long size);
unsigned long __copy_user_intel(void __user *to, const void *from,
unsigned long size);
unsigned long __copy_user_zeroing_intel_nocache(void *to,
const void __user *from, unsigned long size);
#endif /* CONFIG_X86_INTEL_USERCOPY */
/* Generic arbitrary sized copy. */
#define __copy_user(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 2b\n" \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,2b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
#define __copy_user_zeroing(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 6f\n" \
"3: lea 0(%3,%0,4),%0\n" \
"6: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosb\n" \
" popl %%eax\n" \
" popl %0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,6b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
unsigned long __copy_to_user_ll(void __user *to, const void *from,
unsigned long n)
{
#ifndef CONFIG_X86_WP_WORKS_OK
if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
((unsigned long)to) < TASK_SIZE) {
/*
* When we are in an atomic section (see
* mm/filemap.c:file_read_actor), return the full
* length to take the slow path.
*/
if (in_atomic())
return n;
/*
* CPU does not honor the WP bit when writing
* from supervisory mode, and due to preemption or SMP,
* the page tables can change at any time.
* Do it manually. Manfred <manfred@colorfullife.com>
*/
while (n) {
unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
unsigned long len = PAGE_SIZE - offset;
int retval;
struct page *pg;
void *maddr;
if (len > n)
len = n;
survive:
down_read(&current->mm->mmap_sem);
retval = get_user_pages(current, current->mm,
(unsigned long)to, 1, 1, 0, &pg, NULL);
if (retval == -ENOMEM && is_global_init(current)) {
up_read(&current->mm->mmap_sem);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto survive;
}
if (retval != 1) {
up_read(&current->mm->mmap_sem);
break;
}
maddr = kmap_atomic(pg);
memcpy(maddr + offset, from, len);
kunmap_atomic(maddr);
set_page_dirty_lock(pg);
put_page(pg);
up_read(&current->mm->mmap_sem);
from += len;
to += len;
n -= len;
}
return n;
}
#endif
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel(to, from, n);
return n;
}
EXPORT_SYMBOL(__copy_to_user_ll);
unsigned long __copy_from_user_ll(void *to, const void __user *from,
unsigned long n)
{
if (movsl_is_ok(to, from, n))
__copy_user_zeroing(to, from, n);
else
n = __copy_user_zeroing_intel(to, from, n);
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll);
unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
unsigned long n)
{
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel((void __user *)to,
(const void *)from, n);
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nozero);
unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
if (n > 64 && cpu_has_xmm2)
n = __copy_user_zeroing_intel_nocache(to, from, n);
else
__copy_user_zeroing(to, from, n);
#else
__copy_user_zeroing(to, from, n);
#endif
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache);
unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
if (n > 64 && cpu_has_xmm2)
n = __copy_user_intel_nocache(to, from, n);
else
__copy_user(to, from, n);
#else
__copy_user(to, from, n);
#endif
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
/**
* copy_to_user: - Copy a block of data into user space.
* @to: Destination address, in user space.
* @from: Source address, in kernel space.
* @n: Number of bytes to copy.
*
* Context: User context only. This function may sleep.
*
* Copy data from kernel space to user space.
*
* Returns number of bytes that could not be copied.
* On success, this will be zero.
*/
unsigned long
copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (access_ok(VERIFY_WRITE, to, n))
n = __copy_to_user(to, from, n);
return n;
}
EXPORT_SYMBOL(copy_to_user);
/**
* copy_from_user: - Copy a block of data from user space.
* @to: Destination address, in kernel space.
* @from: Source address, in user space.
* @n: Number of bytes to copy.
*
* Context: User context only. This function may sleep.
*
* Copy data from user space to kernel space.
*
* Returns number of bytes that could not be copied.
* On success, this will be zero.
*
* If some data could not be copied, this function will pad the copied
* data to the requested size using zero bytes.
*/
unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
n = __copy_from_user(to, from, n);
else
memset(to, 0, n);
return n;
}
EXPORT_SYMBOL(_copy_from_user);
void copy_from_user_overflow(void)
{
WARN(1, "Buffer overflow detected!\n");
}
EXPORT_SYMBOL(copy_from_user_overflow);
+134
View File
@@ -0,0 +1,134 @@
/*
* User address space access functions.
*
* Copyright 1997 Andi Kleen <ak@muc.de>
* Copyright 1997 Linus Torvalds
* Copyright 2002 Andi Kleen <ak@suse.de>
*/
#include <linux/module.h>
#include <asm/uaccess.h>
/*
* Zero Userspace
*/
unsigned long __clear_user(void __user *addr, unsigned long size)
{
long __d0;
might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
"0: movq %[zero],(%[dst])\n"
" addq %[eight],%[dst]\n"
" decl %%ecx ; jnz 0b\n"
"4: movq %[size1],%%rcx\n"
" testl %%ecx,%%ecx\n"
" jz 2f\n"
"1: movb %b[zero],(%[dst])\n"
" incq %[dst]\n"
" decl %%ecx ; jnz 1b\n"
"2:\n"
".section .fixup,\"ax\"\n"
"3: lea 0(%[size1],%[size8],8),%[size8]\n"
" jmp 2b\n"
".previous\n"
_ASM_EXTABLE(0b,3b)
_ASM_EXTABLE(1b,2b)
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
[zero] "r" (0UL), [eight] "r" (8UL));
return size;
}
EXPORT_SYMBOL(__clear_user);
unsigned long clear_user(void __user *to, unsigned long n)
{
if (access_ok(VERIFY_WRITE, to, n))
return __clear_user(to, n);
return n;
}
EXPORT_SYMBOL(clear_user);
/*
* Return the size of a string (including the ending 0)
*
* Return 0 on exception, a value greater than N if too long
*/
long __strnlen_user(const char __user *s, long n)
{
long res = 0;
char c;
while (1) {
if (res>n)
return n+1;
if (__get_user(c, s))
return 0;
if (!c)
return res+1;
res++;
s++;
}
}
EXPORT_SYMBOL(__strnlen_user);
long strnlen_user(const char __user *s, long n)
{
if (!access_ok(VERIFY_READ, s, 1))
return 0;
return __strnlen_user(s, n);
}
EXPORT_SYMBOL(strnlen_user);
long strlen_user(const char __user *s)
{
long res = 0;
char c;
for (;;) {
if (get_user(c, s))
return 0;
if (!c)
return res+1;
res++;
s++;
}
}
EXPORT_SYMBOL(strlen_user);
unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
{
if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
return copy_user_generic((__force void *)to, (__force void *)from, len);
}
return len;
}
EXPORT_SYMBOL(copy_in_user);
/*
* Try to copy last bytes and clear the rest if needed.
* Since protection fault in copy_from/to_user is not a normal situation,
* it is not necessary to optimize tail handling.
*/
unsigned long
copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
{
char c;
unsigned zero_len;
for (; len; --len) {
if (__get_user_nocheck(c, from++, sizeof(char)))
break;
if (__put_user_nocheck(c, to++, sizeof(char)))
break;
}
for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
if (__put_user_nocheck(c, to++, sizeof(char)))
break;
return len;
}
+955
View File
@@ -0,0 +1,955 @@
# x86 Opcode Maps
#
# This is (mostly) based on following documentations.
# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2
# (#325383-040US, October 2011)
# - Intel(R) Advanced Vector Extensions Programming Reference
# (#319433-011,JUNE 2011).
#
#<Opcode maps>
# Table: table-name
# Referrer: escaped-name
# AVXcode: avx-code
# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
# (or)
# opcode: escape # escaped-name
# EndTable
#
#<group maps>
# GrpTable: GrpXXX
# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
# EndTable
#
# AVX Superscripts
# (v): this opcode requires VEX prefix.
# (v1): this opcode only supports 128bit VEX.
#
# Last Prefix Superscripts
# - (66): the last prefix is 0x66
# - (F3): the last prefix is 0xF3
# - (F2): the last prefix is 0xF2
#
Table: one byte opcode
Referrer:
AVXcode:
# 0x00 - 0x0f
00: ADD Eb,Gb
01: ADD Ev,Gv
02: ADD Gb,Eb
03: ADD Gv,Ev
04: ADD AL,Ib
05: ADD rAX,Iz
06: PUSH ES (i64)
07: POP ES (i64)
08: OR Eb,Gb
09: OR Ev,Gv
0a: OR Gb,Eb
0b: OR Gv,Ev
0c: OR AL,Ib
0d: OR rAX,Iz
0e: PUSH CS (i64)
0f: escape # 2-byte escape
# 0x10 - 0x1f
10: ADC Eb,Gb
11: ADC Ev,Gv
12: ADC Gb,Eb
13: ADC Gv,Ev
14: ADC AL,Ib
15: ADC rAX,Iz
16: PUSH SS (i64)
17: POP SS (i64)
18: SBB Eb,Gb
19: SBB Ev,Gv
1a: SBB Gb,Eb
1b: SBB Gv,Ev
1c: SBB AL,Ib
1d: SBB rAX,Iz
1e: PUSH DS (i64)
1f: POP DS (i64)
# 0x20 - 0x2f
20: AND Eb,Gb
21: AND Ev,Gv
22: AND Gb,Eb
23: AND Gv,Ev
24: AND AL,Ib
25: AND rAx,Iz
26: SEG=ES (Prefix)
27: DAA (i64)
28: SUB Eb,Gb
29: SUB Ev,Gv
2a: SUB Gb,Eb
2b: SUB Gv,Ev
2c: SUB AL,Ib
2d: SUB rAX,Iz
2e: SEG=CS (Prefix)
2f: DAS (i64)
# 0x30 - 0x3f
30: XOR Eb,Gb
31: XOR Ev,Gv
32: XOR Gb,Eb
33: XOR Gv,Ev
34: XOR AL,Ib
35: XOR rAX,Iz
36: SEG=SS (Prefix)
37: AAA (i64)
38: CMP Eb,Gb
39: CMP Ev,Gv
3a: CMP Gb,Eb
3b: CMP Gv,Ev
3c: CMP AL,Ib
3d: CMP rAX,Iz
3e: SEG=DS (Prefix)
3f: AAS (i64)
# 0x40 - 0x4f
40: INC eAX (i64) | REX (o64)
41: INC eCX (i64) | REX.B (o64)
42: INC eDX (i64) | REX.X (o64)
43: INC eBX (i64) | REX.XB (o64)
44: INC eSP (i64) | REX.R (o64)
45: INC eBP (i64) | REX.RB (o64)
46: INC eSI (i64) | REX.RX (o64)
47: INC eDI (i64) | REX.RXB (o64)
48: DEC eAX (i64) | REX.W (o64)
49: DEC eCX (i64) | REX.WB (o64)
4a: DEC eDX (i64) | REX.WX (o64)
4b: DEC eBX (i64) | REX.WXB (o64)
4c: DEC eSP (i64) | REX.WR (o64)
4d: DEC eBP (i64) | REX.WRB (o64)
4e: DEC eSI (i64) | REX.WRX (o64)
4f: DEC eDI (i64) | REX.WRXB (o64)
# 0x50 - 0x5f
50: PUSH rAX/r8 (d64)
51: PUSH rCX/r9 (d64)
52: PUSH rDX/r10 (d64)
53: PUSH rBX/r11 (d64)
54: PUSH rSP/r12 (d64)
55: PUSH rBP/r13 (d64)
56: PUSH rSI/r14 (d64)
57: PUSH rDI/r15 (d64)
58: POP rAX/r8 (d64)
59: POP rCX/r9 (d64)
5a: POP rDX/r10 (d64)
5b: POP rBX/r11 (d64)
5c: POP rSP/r12 (d64)
5d: POP rBP/r13 (d64)
5e: POP rSI/r14 (d64)
5f: POP rDI/r15 (d64)
# 0x60 - 0x6f
60: PUSHA/PUSHAD (i64)
61: POPA/POPAD (i64)
62: BOUND Gv,Ma (i64)
63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
64: SEG=FS (Prefix)
65: SEG=GS (Prefix)
66: Operand-Size (Prefix)
67: Address-Size (Prefix)
68: PUSH Iz (d64)
69: IMUL Gv,Ev,Iz
6a: PUSH Ib (d64)
6b: IMUL Gv,Ev,Ib
6c: INS/INSB Yb,DX
6d: INS/INSW/INSD Yz,DX
6e: OUTS/OUTSB DX,Xb
6f: OUTS/OUTSW/OUTSD DX,Xz
# 0x70 - 0x7f
70: JO Jb
71: JNO Jb
72: JB/JNAE/JC Jb
73: JNB/JAE/JNC Jb
74: JZ/JE Jb
75: JNZ/JNE Jb
76: JBE/JNA Jb
77: JNBE/JA Jb
78: JS Jb
79: JNS Jb
7a: JP/JPE Jb
7b: JNP/JPO Jb
7c: JL/JNGE Jb
7d: JNL/JGE Jb
7e: JLE/JNG Jb
7f: JNLE/JG Jb
# 0x80 - 0x8f
80: Grp1 Eb,Ib (1A)
81: Grp1 Ev,Iz (1A)
82: Grp1 Eb,Ib (1A),(i64)
83: Grp1 Ev,Ib (1A)
84: TEST Eb,Gb
85: TEST Ev,Gv
86: XCHG Eb,Gb
87: XCHG Ev,Gv
88: MOV Eb,Gb
89: MOV Ev,Gv
8a: MOV Gb,Eb
8b: MOV Gv,Ev
8c: MOV Ev,Sw
8d: LEA Gv,M
8e: MOV Sw,Ew
8f: Grp1A (1A) | POP Ev (d64)
# 0x90 - 0x9f
90: NOP | PAUSE (F3) | XCHG r8,rAX
91: XCHG rCX/r9,rAX
92: XCHG rDX/r10,rAX
93: XCHG rBX/r11,rAX
94: XCHG rSP/r12,rAX
95: XCHG rBP/r13,rAX
96: XCHG rSI/r14,rAX
97: XCHG rDI/r15,rAX
98: CBW/CWDE/CDQE
99: CWD/CDQ/CQO
9a: CALLF Ap (i64)
9b: FWAIT/WAIT
9c: PUSHF/D/Q Fv (d64)
9d: POPF/D/Q Fv (d64)
9e: SAHF
9f: LAHF
# 0xa0 - 0xaf
a0: MOV AL,Ob
a1: MOV rAX,Ov
a2: MOV Ob,AL
a3: MOV Ov,rAX
a4: MOVS/B Yb,Xb
a5: MOVS/W/D/Q Yv,Xv
a6: CMPS/B Xb,Yb
a7: CMPS/W/D Xv,Yv
a8: TEST AL,Ib
a9: TEST rAX,Iz
aa: STOS/B Yb,AL
ab: STOS/W/D/Q Yv,rAX
ac: LODS/B AL,Xb
ad: LODS/W/D/Q rAX,Xv
ae: SCAS/B AL,Yb
# Note: The May 2011 Intel manual shows Xv for the second parameter of the
# next instruction but Yv is correct
af: SCAS/W/D/Q rAX,Yv
# 0xb0 - 0xbf
b0: MOV AL/R8L,Ib
b1: MOV CL/R9L,Ib
b2: MOV DL/R10L,Ib
b3: MOV BL/R11L,Ib
b4: MOV AH/R12L,Ib
b5: MOV CH/R13L,Ib
b6: MOV DH/R14L,Ib
b7: MOV BH/R15L,Ib
b8: MOV rAX/r8,Iv
b9: MOV rCX/r9,Iv
ba: MOV rDX/r10,Iv
bb: MOV rBX/r11,Iv
bc: MOV rSP/r12,Iv
bd: MOV rBP/r13,Iv
be: MOV rSI/r14,Iv
bf: MOV rDI/r15,Iv
# 0xc0 - 0xcf
c0: Grp2 Eb,Ib (1A)
c1: Grp2 Ev,Ib (1A)
c2: RETN Iw (f64)
c3: RETN
c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
c6: Grp11 Eb,Ib (1A)
c7: Grp11 Ev,Iz (1A)
c8: ENTER Iw,Ib
c9: LEAVE (d64)
ca: RETF Iw
cb: RETF
cc: INT3
cd: INT Ib
ce: INTO (i64)
cf: IRET/D/Q
# 0xd0 - 0xdf
d0: Grp2 Eb,1 (1A)
d1: Grp2 Ev,1 (1A)
d2: Grp2 Eb,CL (1A)
d3: Grp2 Ev,CL (1A)
d4: AAM Ib (i64)
d5: AAD Ib (i64)
d6:
d7: XLAT/XLATB
d8: ESC
d9: ESC
da: ESC
db: ESC
dc: ESC
dd: ESC
de: ESC
df: ESC
# 0xe0 - 0xef
e0: LOOPNE/LOOPNZ Jb (f64)
e1: LOOPE/LOOPZ Jb (f64)
e2: LOOP Jb (f64)
e3: JrCXZ Jb (f64)
e4: IN AL,Ib
e5: IN eAX,Ib
e6: OUT Ib,AL
e7: OUT Ib,eAX
e8: CALL Jz (f64)
e9: JMP-near Jz (f64)
ea: JMP-far Ap (i64)
eb: JMP-short Jb (f64)
ec: IN AL,DX
ed: IN eAX,DX
ee: OUT DX,AL
ef: OUT DX,eAX
# 0xf0 - 0xff
f0: LOCK (Prefix)
f1:
f2: REPNE (Prefix)
f3: REP/REPE (Prefix)
f4: HLT
f5: CMC
f6: Grp3_1 Eb (1A)
f7: Grp3_2 Ev (1A)
f8: CLC
f9: STC
fa: CLI
fb: STI
fc: CLD
fd: STD
fe: Grp4 (1A)
ff: Grp5 (1A)
EndTable
Table: 2-byte opcode (0x0f)
Referrer: 2-byte escape
AVXcode: 1
# 0x0f 0x00-0x0f
00: Grp6 (1A)
01: Grp7 (1A)
02: LAR Gv,Ew
03: LSL Gv,Ew
04:
05: SYSCALL (o64)
06: CLTS
07: SYSRET (o64)
08: INVD
09: WBINVD
0a:
0b: UD2 (1B)
0c:
0d: NOP Ev | GrpP
0e: FEMMS
# 3DNow! uses the last imm byte as opcode extension.
0f: 3DNow! Pq,Qq,Ib
# 0x0f 0x10-0x1f
# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
# Reference A.1
10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
18: Grp16 (1A)
19:
1a:
1b:
1c:
1d:
1e:
1f: NOP Ev
# 0x0f 0x20-0x2f
20: MOV Rd,Cd
21: MOV Rd,Dd
22: MOV Cd,Rd
23: MOV Dd,Rd
24:
25:
26:
27:
28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
# 0x0f 0x30-0x3f
30: WRMSR
31: RDTSC
32: RDMSR
33: RDPMC
34: SYSENTER
35: SYSEXIT
36:
37: GETSEC
38: escape # 3-byte escape 1
39:
3a: escape # 3-byte escape 2
3b:
3c:
3d:
3e:
3f:
# 0x0f 0x40-0x4f
40: CMOVO Gv,Ev
41: CMOVNO Gv,Ev
42: CMOVB/C/NAE Gv,Ev
43: CMOVAE/NB/NC Gv,Ev
44: CMOVE/Z Gv,Ev
45: CMOVNE/NZ Gv,Ev
46: CMOVBE/NA Gv,Ev
47: CMOVA/NBE Gv,Ev
48: CMOVS Gv,Ev
49: CMOVNS Gv,Ev
4a: CMOVP/PE Gv,Ev
4b: CMOVNP/PO Gv,Ev
4c: CMOVL/NGE Gv,Ev
4d: CMOVNL/GE Gv,Ev
4e: CMOVLE/NG Gv,Ev
4f: CMOVNLE/G Gv,Ev
# 0x0f 0x50-0x5f
50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
# 0x0f 0x60-0x6f
60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
# 0x0f 0x70-0x7f
70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
71: Grp12 (1A)
72: Grp13 (1A)
73: Grp14 (1A)
74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
77: emms | vzeroupper | vzeroall
78: VMREAD Ey,Gy
79: VMWRITE Gy,Ey
7a:
7b:
7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
# 0x0f 0x80-0x8f
80: JO Jz (f64)
81: JNO Jz (f64)
82: JB/JC/JNAE Jz (f64)
83: JAE/JNB/JNC Jz (f64)
84: JE/JZ Jz (f64)
85: JNE/JNZ Jz (f64)
86: JBE/JNA Jz (f64)
87: JA/JNBE Jz (f64)
88: JS Jz (f64)
89: JNS Jz (f64)
8a: JP/JPE Jz (f64)
8b: JNP/JPO Jz (f64)
8c: JL/JNGE Jz (f64)
8d: JNL/JGE Jz (f64)
8e: JLE/JNG Jz (f64)
8f: JNLE/JG Jz (f64)
# 0x0f 0x90-0x9f
90: SETO Eb
91: SETNO Eb
92: SETB/C/NAE Eb
93: SETAE/NB/NC Eb
94: SETE/Z Eb
95: SETNE/NZ Eb
96: SETBE/NA Eb
97: SETA/NBE Eb
98: SETS Eb
99: SETNS Eb
9a: SETP/PE Eb
9b: SETNP/PO Eb
9c: SETL/NGE Eb
9d: SETNL/GE Eb
9e: SETLE/NG Eb
9f: SETNLE/G Eb
# 0x0f 0xa0-0xaf
a0: PUSH FS (d64)
a1: POP FS (d64)
a2: CPUID
a3: BT Ev,Gv
a4: SHLD Ev,Gv,Ib
a5: SHLD Ev,Gv,CL
a6: GrpPDLK
a7: GrpRNG
a8: PUSH GS (d64)
a9: POP GS (d64)
aa: RSM
ab: BTS Ev,Gv
ac: SHRD Ev,Gv,Ib
ad: SHRD Ev,Gv,CL
ae: Grp15 (1A),(1C)
af: IMUL Gv,Ev
# 0x0f 0xb0-0xbf
b0: CMPXCHG Eb,Gb
b1: CMPXCHG Ev,Gv
b2: LSS Gv,Mp
b3: BTR Ev,Gv
b4: LFS Gv,Mp
b5: LGS Gv,Mp
b6: MOVZX Gv,Eb
b7: MOVZX Gv,Ew
b8: JMPE | POPCNT Gv,Ev (F3)
b9: Grp10 (1A)
ba: Grp8 Ev,Ib (1A)
bb: BTC Ev,Gv
bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
be: MOVSX Gv,Eb
bf: MOVSX Gv,Ew
# 0x0f 0xc0-0xcf
c0: XADD Eb,Gb
c1: XADD Ev,Gv
c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
c3: movnti My,Gy
c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
c7: Grp9 (1A)
c8: BSWAP RAX/EAX/R8/R8D
c9: BSWAP RCX/ECX/R9/R9D
ca: BSWAP RDX/EDX/R10/R10D
cb: BSWAP RBX/EBX/R11/R11D
cc: BSWAP RSP/ESP/R12/R12D
cd: BSWAP RBP/EBP/R13/R13D
ce: BSWAP RSI/ESI/R14/R14D
cf: BSWAP RDI/EDI/R15/R15D
# 0x0f 0xd0-0xdf
d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
# 0x0f 0xe0-0xef
e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
# 0x0f 0xf0-0xff
f0: vlddqu Vx,Mx (F2)
f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
ff:
EndTable
Table: 3-byte opcode 1 (0x0f 0x38)
Referrer: 3-byte escape 1
AVXcode: 2
# 0x0f 0x38 0x00-0x0f
00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
0c: vpermilps Vx,Hx,Wx (66),(v)
0d: vpermilpd Vx,Hx,Wx (66),(v)
0e: vtestps Vx,Wx (66),(v)
0f: vtestpd Vx,Wx (66),(v)
# 0x0f 0x38 0x10-0x1f
10: pblendvb Vdq,Wdq (66)
11:
12:
13: vcvtph2ps Vx,Wx,Ib (66),(v)
14: blendvps Vdq,Wdq (66)
15: blendvpd Vdq,Wdq (66)
16: vpermps Vqq,Hqq,Wqq (66),(v)
17: vptest Vx,Wx (66)
18: vbroadcastss Vx,Wd (66),(v)
19: vbroadcastsd Vqq,Wq (66),(v)
1a: vbroadcastf128 Vqq,Mdq (66),(v)
1b:
1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
1f:
# 0x0f 0x38 0x20-0x2f
20: vpmovsxbw Vx,Ux/Mq (66),(v1)
21: vpmovsxbd Vx,Ux/Md (66),(v1)
22: vpmovsxbq Vx,Ux/Mw (66),(v1)
23: vpmovsxwd Vx,Ux/Mq (66),(v1)
24: vpmovsxwq Vx,Ux/Md (66),(v1)
25: vpmovsxdq Vx,Ux/Mq (66),(v1)
26:
27:
28: vpmuldq Vx,Hx,Wx (66),(v1)
29: vpcmpeqq Vx,Hx,Wx (66),(v1)
2a: vmovntdqa Vx,Mx (66),(v1)
2b: vpackusdw Vx,Hx,Wx (66),(v1)
2c: vmaskmovps Vx,Hx,Mx (66),(v)
2d: vmaskmovpd Vx,Hx,Mx (66),(v)
2e: vmaskmovps Mx,Hx,Vx (66),(v)
2f: vmaskmovpd Mx,Hx,Vx (66),(v)
# 0x0f 0x38 0x30-0x3f
30: vpmovzxbw Vx,Ux/Mq (66),(v1)
31: vpmovzxbd Vx,Ux/Md (66),(v1)
32: vpmovzxbq Vx,Ux/Mw (66),(v1)
33: vpmovzxwd Vx,Ux/Mq (66),(v1)
34: vpmovzxwq Vx,Ux/Md (66),(v1)
35: vpmovzxdq Vx,Ux/Mq (66),(v1)
36: vpermd Vqq,Hqq,Wqq (66),(v)
37: vpcmpgtq Vx,Hx,Wx (66),(v1)
38: vpminsb Vx,Hx,Wx (66),(v1)
39: vpminsd Vx,Hx,Wx (66),(v1)
3a: vpminuw Vx,Hx,Wx (66),(v1)
3b: vpminud Vx,Hx,Wx (66),(v1)
3c: vpmaxsb Vx,Hx,Wx (66),(v1)
3d: vpmaxsd Vx,Hx,Wx (66),(v1)
3e: vpmaxuw Vx,Hx,Wx (66),(v1)
3f: vpmaxud Vx,Hx,Wx (66),(v1)
# 0x0f 0x38 0x40-0x8f
40: vpmulld Vx,Hx,Wx (66),(v1)
41: vphminposuw Vdq,Wdq (66),(v1)
42:
43:
44:
45: vpsrlvd/q Vx,Hx,Wx (66),(v)
46: vpsravd Vx,Hx,Wx (66),(v)
47: vpsllvd/q Vx,Hx,Wx (66),(v)
# Skip 0x48-0x57
58: vpbroadcastd Vx,Wx (66),(v)
59: vpbroadcastq Vx,Wx (66),(v)
5a: vbroadcasti128 Vqq,Mdq (66),(v)
# Skip 0x5b-0x77
78: vpbroadcastb Vx,Wx (66),(v)
79: vpbroadcastw Vx,Wx (66),(v)
# Skip 0x7a-0x7f
80: INVEPT Gy,Mdq (66)
81: INVPID Gy,Mdq (66)
82: INVPCID Gy,Mdq (66)
8c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
8e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
# 0x0f 0x38 0x90-0xbf (FMA)
90: vgatherdd/q Vx,Hx,Wx (66),(v)
91: vgatherqd/q Vx,Hx,Wx (66),(v)
92: vgatherdps/d Vx,Hx,Wx (66),(v)
93: vgatherqps/d Vx,Hx,Wx (66),(v)
94:
95:
96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
98: vfmadd132ps/d Vx,Hx,Wx (66),(v)
99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
9a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
# 0x0f 0x38 0xc0-0xff
db: VAESIMC Vdq,Wdq (66),(v1)
dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
f6: MULX By,Gy,rDX,Ey (F2),(v)
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
Table: 3-byte opcode 2 (0x0f 0x3a)
Referrer: 3-byte escape 2
AVXcode: 3
# 0x0f 0x3a 0x00-0xff
00: vpermq Vqq,Wqq,Ib (66),(v)
01: vpermpd Vqq,Wqq,Ib (66),(v)
02: vpblendd Vx,Hx,Wx,Ib (66),(v)
03:
04: vpermilps Vx,Wx,Ib (66),(v)
05: vpermilpd Vx,Wx,Ib (66),(v)
06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
07:
08: vroundps Vx,Wx,Ib (66)
09: vroundpd Vx,Wx,Ib (66)
0a: vroundss Vss,Wss,Ib (66),(v1)
0b: vroundsd Vsd,Wsd,Ib (66),(v1)
0c: vblendps Vx,Hx,Wx,Ib (66)
0d: vblendpd Vx,Hx,Wx,Ib (66)
0e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
14: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
15: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
16: vpextrd/q Ey,Vdq,Ib (66),(v1)
17: vextractps Ed,Vdq,Ib (66),(v1)
18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
19: vextractf128 Wdq,Vqq,Ib (66),(v)
1d: vcvtps2ph Wx,Vx,Ib (66),(v)
20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
39: vextracti128 Wdq,Vqq,Ib (66),(v)
40: vdpps Vx,Hx,Wx,Ib (66)
41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
4a: vblendvps Vx,Hx,Wx,Lx (66),(v)
4b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
f0: RORX Gy,Ey,Ib (F2),(v)
EndTable
GrpTable: Grp1
0: ADD
1: OR
2: ADC
3: SBB
4: AND
5: SUB
6: XOR
7: CMP
EndTable
GrpTable: Grp1A
0: POP
EndTable
GrpTable: Grp2
0: ROL
1: ROR
2: RCL
3: RCR
4: SHL/SAL
5: SHR
6:
7: SAR
EndTable
GrpTable: Grp3_1
0: TEST Eb,Ib
1:
2: NOT Eb
3: NEG Eb
4: MUL AL,Eb
5: IMUL AL,Eb
6: DIV AL,Eb
7: IDIV AL,Eb
EndTable
GrpTable: Grp3_2
0: TEST Ev,Iz
1:
2: NOT Ev
3: NEG Ev
4: MUL rAX,Ev
5: IMUL rAX,Ev
6: DIV rAX,Ev
7: IDIV rAX,Ev
EndTable
GrpTable: Grp4
0: INC Eb
1: DEC Eb
EndTable
GrpTable: Grp5
0: INC Ev
1: DEC Ev
2: CALLN Ev (f64)
3: CALLF Ep
4: JMPN Ev (f64)
5: JMPF Mp
6: PUSH Ev (d64)
7:
EndTable
GrpTable: Grp6
0: SLDT Rv/Mw
1: STR Rv/Mw
2: LLDT Ew
3: LTR Ew
4: VERR Ew
5: VERW Ew
EndTable
GrpTable: Grp7
0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B)
3: LIDT Ms
4: SMSW Mw/Rv
5:
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
GrpTable: Grp8
4: BT
5: BTS
6: BTR
7: BTC
EndTable
GrpTable: Grp9
1: CMPXCHG8B/16B Mq/Mdq
6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
7: VMPTRST Mq | VMPTRST Mq (F3)
EndTable
GrpTable: Grp10
EndTable
GrpTable: Grp11
# Note: the operands are given by group opcode
0: MOV
EndTable
GrpTable: Grp12
2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
EndTable
GrpTable: Grp13
2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
EndTable
GrpTable: Grp14
2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
3: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
7: vpslldq Hx,Ux,Ib (66),(11B),(v1)
EndTable
GrpTable: Grp15
0: fxsave | RDFSBASE Ry (F3),(11B)
1: fxstor | RDGSBASE Ry (F3),(11B)
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE
5: XRSTOR | lfence (11B)
6: XSAVEOPT | mfence (11B)
7: clflush | sfence (11B)
EndTable
GrpTable: Grp16
0: prefetch NTA
1: prefetch T0
2: prefetch T1
3: prefetch T2
EndTable
GrpTable: Grp17
1: BLSR By,Ey (v)
2: BLSMSK By,Ey (v)
3: BLSI By,Ey (v)
EndTable
# AMD's Prefetch Group
GrpTable: GrpP
0: PREFETCH
1: PREFETCHW
EndTable
GrpTable: GrpPDLK
0: MONTMUL
1: XSHA1
2: XSHA2
EndTable
GrpTable: GrpRNG
0: xstore-rng
1: xcrypt-ecb
2: xcrypt-cbc
4: xcrypt-cfb
5: xcrypt-ofb
EndTable