Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next

Pull sparc updates from David Miller: "Largely this is simply adding support for the Niagara 4 cpu. Major areas are perf events (chip now supports 4 counters and can monitor any event on each counter), crypto (opcodes are availble for sha1, sha256, sha512, md5, crc32c, AES, DES, CAMELLIA, and Kasumi although the last is unsupported since we lack a generic crypto layer Kasumi implementation), and an optimized memcpy. Finally some cleanups by Peter Senna Tschudin." * git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next: (47 commits) sparc64: Fix trailing whitespace in NG4 memcpy. sparc64: Fix comment type in NG4 copy from user. sparc64: Add SPARC-T4 optimized memcpy. drivers/sbus/char: removes unnecessary semicolon arch/sparc/kernel/pci_sun4v.c: removes unnecessary semicolon sparc64: Fix function argument comment in camellia_sparc64_key_expand asm. sparc64: Fix IV handling bug in des_sparc64_cbc_decrypt sparc64: Add auto-loading mechanism to crypto-opcode drivers. sparc64: Add missing pr_fmt define to crypto opcode drivers. sparc64: Adjust crypto priorities. sparc64: Use cpu_pgsz_mask for linear kernel mapping config. sparc64: Probe cpu page size support more portably. sparc64: Support 2GB and 16GB page sizes for kernel linear mappings. sparc64: Fix bugs in unrolled 256-bit loops. sparc64: Avoid code duplication in crypto assembler. sparc64: Unroll CTR crypt loops in AES driver. sparc64: Unroll ECB decryption loops in AES driver. sparc64: Unroll ECB encryption loops in AES driver. sparc64: Add ctr mode support to AES driver. sparc64: Move AES driver over to a methods based implementation. ...

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next
Pull sparc updates from David Miller: "Largely this is simply adding support for the Niagara 4 cpu. Major areas are perf events (chip now supports 4 counters and can monitor any event on each counter), crypto (opcodes are availble for sha1, sha256, sha512, md5, crc32c, AES, DES, CAMELLIA, and Kasumi although the last is unsupported since we lack a generic crypto layer Kasumi implementation), and an optimized memcpy. Finally some cleanups by Peter Senna Tschudin." * git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next: (47 commits) sparc64: Fix trailing whitespace in NG4 memcpy. sparc64: Fix comment type in NG4 copy from user. sparc64: Add SPARC-T4 optimized memcpy. drivers/sbus/char: removes unnecessary semicolon arch/sparc/kernel/pci_sun4v.c: removes unnecessary semicolon sparc64: Fix function argument comment in camellia_sparc64_key_expand asm. sparc64: Fix IV handling bug in des_sparc64_cbc_decrypt sparc64: Add auto-loading mechanism to crypto-opcode drivers. sparc64: Add missing pr_fmt define to crypto opcode drivers. sparc64: Adjust crypto priorities. sparc64: Use cpu_pgsz_mask for linear kernel mapping config. sparc64: Probe cpu page size support more portably. sparc64: Support 2GB and 16GB page sizes for kernel linear mappings. sparc64: Fix bugs in unrolled 256-bit loops. sparc64: Avoid code duplication in crypto assembler. sparc64: Unroll CTR crypt loops in AES driver. sparc64: Unroll ECB decryption loops in AES driver. sparc64: Unroll ECB encryption loops in AES driver. sparc64: Add ctr mode support to AES driver. sparc64: Move AES driver over to a methods based implementation. ...
a20acf99 · Linus Torvalds · 437589a7 · 42a4172b · a20acf99 · a20acf99
Commit a20acf99 authored Oct 02, 2012 by Linus Torvalds
52 changed files
--- a/arch/sparc/Kbuild
+++ b/arch/sparc/Kbuild
@@ -6,3 +6,4 @@ obj-y += kernel/
 obj-y += mm/
 obj-y += math-emu/
 obj-y += net/
+obj-y += crypto/
--- a/arch/sparc/crypto/Makefile
+++ b/arch/sparc/crypto/Makefile
+#
+# Arch-specific CryptoAPI modules.
+#
+
+obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o
+obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o
+obj-$(CONFIG_CRYPTO_SHA512_SPARC64) += sha512-sparc64.o
+obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o
+
+obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o
+obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o
+obj-$(CONFIG_CRYPTO_DES_SPARC64) += camellia-sparc64.o
+
+obj-$(CONFIG_CRYPTO_CRC32C_SPARC64) += crc32c-sparc64.o
+
+sha1-sparc64-y := sha1_asm.o sha1_glue.o crop_devid.o
+sha256-sparc64-y := sha256_asm.o sha256_glue.o crop_devid.o
+sha512-sparc64-y := sha512_asm.o sha512_glue.o crop_devid.o
+md5-sparc64-y := md5_asm.o md5_glue.o crop_devid.o
+
+aes-sparc64-y := aes_asm.o aes_glue.o crop_devid.o
+des-sparc64-y := des_asm.o des_glue.o crop_devid.o
+camellia-sparc64-y := camellia_asm.o camellia_glue.o crop_devid.o
+
+crc32c-sparc64-y := crc32c_asm.o crc32c_glue.o crop_devid.o
--- a/arch/sparc/crypto/aes_asm.S
+++ b/arch/sparc/crypto/aes_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23(KEY_BASE +  6, T0, T1, I1)
+
+#define ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01(KEY_BASE +  0, I2, I3, T2) \
+	AES_EROUND23(KEY_BASE +  2, I2, I3, T3) \
+	AES_EROUND01(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23(KEY_BASE +  6, T0, T1, I1) \
+	AES_EROUND01(KEY_BASE +  4, T2, T3, I2) \
+	AES_EROUND23(KEY_BASE +  6, T2, T3, I3)
+
+#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01_L(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23_L(KEY_BASE +  6, T0, T1, I1)
+
+#define ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01(KEY_BASE +  0, I2, I3, T2) \
+	AES_EROUND23(KEY_BASE +  2, I2, I3, T3) \
+	AES_EROUND01_L(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23_L(KEY_BASE +  6, T0, T1, I1) \
+	AES_EROUND01_L(KEY_BASE +  4, T2, T3, I2) \
+	AES_EROUND23_L(KEY_BASE +  6, T2, T3, I3)
+
+	/* 10 rounds */
+#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
+
+#define ENCRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 12 rounds */
+#define ENCRYPT_192(KEY_BASE, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
+
+#define ENCRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 14 rounds */
+#define ENCRYPT_256(KEY_BASE, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
+
+#define ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \
+			     TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6)
+
+#define ENCRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, KEY_BASE + 48) \
+	ldd	[%o0 + 0xd0], %f56; \
+	ldd	[%o0 + 0xd8], %f58; \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, KEY_BASE +  0) \
+	ldd	[%o0 + 0xe0], %f60; \
+	ldd	[%o0 + 0xe8], %f62; \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE +  0) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE +  0) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE +  0) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE +  0) \
+	AES_EROUND01(KEY_BASE +  48, I0, I1, KEY_BASE + 0) \
+	AES_EROUND23(KEY_BASE +  50, I0, I1, KEY_BASE + 2) \
+	AES_EROUND01(KEY_BASE +  48, I2, I3, KEY_BASE + 4) \
+	AES_EROUND23(KEY_BASE +  50, I2, I3, KEY_BASE + 6) \
+	AES_EROUND01_L(KEY_BASE +  52, KEY_BASE + 0, KEY_BASE + 2, I0) \
+	AES_EROUND23_L(KEY_BASE +  54, KEY_BASE + 0, KEY_BASE + 2, I1) \
+	ldd	[%o0 + 0x10], %f8; \
+	ldd	[%o0 + 0x18], %f10; \
+	AES_EROUND01_L(KEY_BASE +  52, KEY_BASE + 4, KEY_BASE + 6, I2) \
+	AES_EROUND23_L(KEY_BASE +  54, KEY_BASE + 4, KEY_BASE + 6, I3) \
+	ldd	[%o0 + 0x20], %f12; \
+	ldd	[%o0 + 0x28], %f14;
+
+#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01(KEY_BASE +  6, T0, T1, I0)
+
+#define DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23(KEY_BASE +  0, I2, I3, T3) \
+	AES_DROUND01(KEY_BASE +  2, I2, I3, T2) \
+	AES_DROUND23(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01(KEY_BASE +  6, T0, T1, I0) \
+	AES_DROUND23(KEY_BASE +  4, T2, T3, I3) \
+	AES_DROUND01(KEY_BASE +  6, T2, T3, I2)
+
+#define DECRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23_L(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01_L(KEY_BASE +  6, T0, T1, I0)
+
+#define DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23(KEY_BASE +  0, I2, I3, T3) \
+	AES_DROUND01(KEY_BASE +  2, I2, I3, T2) \
+	AES_DROUND23_L(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01_L(KEY_BASE +  6, T0, T1, I0) \
+	AES_DROUND23_L(KEY_BASE +  4, T2, T3, I3) \
+	AES_DROUND01_L(KEY_BASE +  6, T2, T3, I2)
+
+	/* 10 rounds */
+#define DECRYPT_128(KEY_BASE, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
+
+#define DECRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 12 rounds */
+#define DECRYPT_192(KEY_BASE, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
+
+#define DECRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 14 rounds */
+#define DECRYPT_256(KEY_BASE, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
+
+#define DECRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \
+			     TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6)
+
+#define DECRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, KEY_BASE + 48) \
+	ldd	[%o0 + 0x18], %f56; \
+	ldd	[%o0 + 0x10], %f58; \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, KEY_BASE +  0) \
+	ldd	[%o0 + 0x08], %f60; \
+	ldd	[%o0 + 0x00], %f62; \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE +  0) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE +  0) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE +  0) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE +  0) \
+	AES_DROUND23(KEY_BASE +  48, I0, I1, KEY_BASE + 2) \
+	AES_DROUND01(KEY_BASE +  50, I0, I1, KEY_BASE + 0) \
+	AES_DROUND23(KEY_BASE +  48, I2, I3, KEY_BASE + 6) \
+	AES_DROUND01(KEY_BASE +  50, I2, I3, KEY_BASE + 4) \
+	AES_DROUND23_L(KEY_BASE +  52, KEY_BASE + 0, KEY_BASE + 2, I1) \
+	AES_DROUND01_L(KEY_BASE +  54, KEY_BASE + 0, KEY_BASE + 2, I0) \
+	ldd	[%o0 + 0xd8], %f8; \
+	ldd	[%o0 + 0xd0], %f10; \
+	AES_DROUND23_L(KEY_BASE +  52, KEY_BASE + 4, KEY_BASE + 6, I3) \
+	AES_DROUND01_L(KEY_BASE +  54, KEY_BASE + 4, KEY_BASE + 6, I2) \
+	ldd	[%o0 + 0xc8], %f12; \
+	ldd	[%o0 + 0xc0], %f14;
+
+	.align	32
+ENTRY(aes_sparc64_key_expand)
+	/* %o0=input_key, %o1=output_key, %o2=key_len */
+	VISEntry
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	ld	[%o0 + 0x0c], %f3
+
+	std	%f0, [%o1 + 0x00]
+	std	%f2, [%o1 + 0x08]
+	add	%o1, 0x10, %o1
+
+	cmp	%o2, 24
+	bl	2f
+	 nop
+
+	be	1f
+	 nop
+
+	/* 256-bit key expansion */
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+	ld	[%o0 + 0x18], %f6
+	ld	[%o0 + 0x1c], %f7
+
+	std	%f4, [%o1 + 0x00]
+	std	%f6, [%o1 + 0x08]
+	add	%o1, 0x10, %o1
+
+	AES_KEXPAND1(0, 6, 0x0, 8)
+	AES_KEXPAND2(2, 8, 10)
+	AES_KEXPAND0(4, 10, 12)
+	AES_KEXPAND2(6, 12, 14)
+	AES_KEXPAND1(8, 14, 0x1, 16)
+	AES_KEXPAND2(10, 16, 18)
+	AES_KEXPAND0(12, 18, 20)
+	AES_KEXPAND2(14, 20, 22)
+	AES_KEXPAND1(16, 22, 0x2, 24)
+	AES_KEXPAND2(18, 24, 26)
+	AES_KEXPAND0(20, 26, 28)
+	AES_KEXPAND2(22, 28, 30)
+	AES_KEXPAND1(24, 30, 0x3, 32)
+	AES_KEXPAND2(26, 32, 34)
+	AES_KEXPAND0(28, 34, 36)
+	AES_KEXPAND2(30, 36, 38)
+	AES_KEXPAND1(32, 38, 0x4, 40)
+	AES_KEXPAND2(34, 40, 42)
+	AES_KEXPAND0(36, 42, 44)
+	AES_KEXPAND2(38, 44, 46)
+	AES_KEXPAND1(40, 46, 0x5, 48)
+	AES_KEXPAND2(42, 48, 50)
+	AES_KEXPAND0(44, 50, 52)
+	AES_KEXPAND2(46, 52, 54)
+	AES_KEXPAND1(48, 54, 0x6, 56)
+	AES_KEXPAND2(50, 56, 58)
+
+	std	%f8, [%o1 + 0x00]
+	std	%f10, [%o1 + 0x08]
+	std	%f12, [%o1 + 0x10]
+	std	%f14, [%o1 + 0x18]
+	std	%f16, [%o1 + 0x20]
+	std	%f18, [%o1 + 0x28]
+	std	%f20, [%o1 + 0x30]
+	std	%f22, [%o1 + 0x38]
+	std	%f24, [%o1 + 0x40]
+	std	%f26, [%o1 + 0x48]
+	std	%f28, [%o1 + 0x50]
+	std	%f30, [%o1 + 0x58]
+	std	%f32, [%o1 + 0x60]
+	std	%f34, [%o1 + 0x68]
+	std	%f36, [%o1 + 0x70]
+	std	%f38, [%o1 + 0x78]
+	std	%f40, [%o1 + 0x80]
+	std	%f42, [%o1 + 0x88]
+	std	%f44, [%o1 + 0x90]
+	std	%f46, [%o1 + 0x98]
+	std	%f48, [%o1 + 0xa0]
+	std	%f50, [%o1 + 0xa8]
+	std	%f52, [%o1 + 0xb0]
+	std	%f54, [%o1 + 0xb8]
+	std	%f56, [%o1 + 0xc0]
+	ba,pt	%xcc, 80f
+	 std	%f58, [%o1 + 0xc8]
+
+1:	
+	/* 192-bit key expansion */
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+
+	std	%f4, [%o1 + 0x00]
+	add	%o1, 0x08, %o1
+
+	AES_KEXPAND1(0, 4, 0x0, 6)
+	AES_KEXPAND2(2, 6, 8)
+	AES_KEXPAND2(4, 8, 10)
+	AES_KEXPAND1(6, 10, 0x1, 12)
+	AES_KEXPAND2(8, 12, 14)
+	AES_KEXPAND2(10, 14, 16)
+	AES_KEXPAND1(12, 16, 0x2, 18)
+	AES_KEXPAND2(14, 18, 20)
+	AES_KEXPAND2(16, 20, 22)
+	AES_KEXPAND1(18, 22, 0x3, 24)
+	AES_KEXPAND2(20, 24, 26)
+	AES_KEXPAND2(22, 26, 28)
+	AES_KEXPAND1(24, 28, 0x4, 30)
+	AES_KEXPAND2(26, 30, 32)
+	AES_KEXPAND2(28, 32, 34)
+	AES_KEXPAND1(30, 34, 0x5, 36)
+	AES_KEXPAND2(32, 36, 38)
+	AES_KEXPAND2(34, 38, 40)
+	AES_KEXPAND1(36, 40, 0x6, 42)
+	AES_KEXPAND2(38, 42, 44)
+	AES_KEXPAND2(40, 44, 46)
+	AES_KEXPAND1(42, 46, 0x7, 48)
+	AES_KEXPAND2(44, 48, 50)
+
+	std	%f6, [%o1 + 0x00]
+	std	%f8, [%o1 + 0x08]
+	std	%f10, [%o1 + 0x10]
+	std	%f12, [%o1 + 0x18]
+	std	%f14, [%o1 + 0x20]
+	std	%f16, [%o1 + 0x28]
+	std	%f18, [%o1 + 0x30]
+	std	%f20, [%o1 + 0x38]
+	std	%f22, [%o1 + 0x40]
+	std	%f24, [%o1 + 0x48]
+	std	%f26, [%o1 + 0x50]
+	std	%f28, [%o1 + 0x58]
+	std	%f30, [%o1 + 0x60]
+	std	%f32, [%o1 + 0x68]
+	std	%f34, [%o1 + 0x70]
+	std	%f36, [%o1 + 0x78]
+	std	%f38, [%o1 + 0x80]
+	std	%f40, [%o1 + 0x88]
+	std	%f42, [%o1 + 0x90]
+	std	%f44, [%o1 + 0x98]
+	std	%f46, [%o1 + 0xa0]
+	std	%f48, [%o1 + 0xa8]
+	ba,pt	%xcc, 80f
+	 std	%f50, [%o1 + 0xb0]
+
+2:
+	/* 128-bit key expansion */
+	AES_KEXPAND1(0, 2, 0x0, 4)
+	AES_KEXPAND2(2, 4, 6)
+	AES_KEXPAND1(4, 6, 0x1, 8)
+	AES_KEXPAND2(6, 8, 10)
+	AES_KEXPAND1(8, 10, 0x2, 12)
+	AES_KEXPAND2(10, 12, 14)
+	AES_KEXPAND1(12, 14, 0x3, 16)
+	AES_KEXPAND2(14, 16, 18)
+	AES_KEXPAND1(16, 18, 0x4, 20)
+	AES_KEXPAND2(18, 20, 22)
+	AES_KEXPAND1(20, 22, 0x5, 24)
+	AES_KEXPAND2(22, 24, 26)
+	AES_KEXPAND1(24, 26, 0x6, 28)
+	AES_KEXPAND2(26, 28, 30)
+	AES_KEXPAND1(28, 30, 0x7, 32)
+	AES_KEXPAND2(30, 32, 34)
+	AES_KEXPAND1(32, 34, 0x8, 36)
+	AES_KEXPAND2(34, 36, 38)
+	AES_KEXPAND1(36, 38, 0x9, 40)
+	AES_KEXPAND2(38, 40, 42)
+
+	std	%f4, [%o1 + 0x00]
+	std	%f6, [%o1 + 0x08]
+	std	%f8, [%o1 + 0x10]
+	std	%f10, [%o1 + 0x18]
+	std	%f12, [%o1 + 0x20]
+	std	%f14, [%o1 + 0x28]
+	std	%f16, [%o1 + 0x30]
+	std	%f18, [%o1 + 0x38]
+	std	%f20, [%o1 + 0x40]
+	std	%f22, [%o1 + 0x48]
+	std	%f24, [%o1 + 0x50]
+	std	%f26, [%o1 + 0x58]
+	std	%f28, [%o1 + 0x60]
+	std	%f30, [%o1 + 0x68]
+	std	%f32, [%o1 + 0x70]
+	std	%f34, [%o1 + 0x78]
+	std	%f36, [%o1 + 0x80]
+	std	%f38, [%o1 + 0x88]
+	std	%f40, [%o1 + 0x90]
+	std	%f42, [%o1 + 0x98]
+80:
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_key_expand)
+
+	.align		32
+ENTRY(aes_sparc64_encrypt_128)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0x00], %f8
+	ldd		[%o0 + 0x08], %f10
+	ldd		[%o0 + 0x10], %f12
+	ldd		[%o0 + 0x18], %f14
+	ldd		[%o0 + 0x20], %f16
+	ldd		[%o0 + 0x28], %f18
+	ldd		[%o0 + 0x30], %f20
+	ldd		[%o0 + 0x38], %f22
+	ldd		[%o0 + 0x40], %f24
+	ldd		[%o0 + 0x48], %f26
+	ldd		[%o0 + 0x50], %f28
+	ldd		[%o0 + 0x58], %f30
+	ldd		[%o0 + 0x60], %f32
+	ldd		[%o0 + 0x68], %f34
+	ldd		[%o0 + 0x70], %f36
+	ldd		[%o0 + 0x78], %f38
+	ldd		[%o0 + 0x80], %f40
+	ldd		[%o0 + 0x88], %f42
+	ldd		[%o0 + 0x90], %f44
+	ldd		[%o0 + 0x98], %f46
+	ldd		[%o0 + 0xa0], %f48
+	ldd		[%o0 + 0xa8], %f50
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+	ENCRYPT_128(12, 4, 6, 0, 2)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_encrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_encrypt_192)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+
+	ldd		[%o0 + 0x00], %f8
+	ldd		[%o0 + 0x08], %f10
+
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	add		%o0, 0x20, %o0
+
+	ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+	ldd		[%o0 + 0x10], %f12
+	ldd		[%o0 + 0x18], %f14
+	ldd		[%o0 + 0x20], %f16
+	ldd		[%o0 + 0x28], %f18
+	ldd		[%o0 + 0x30], %f20
+	ldd		[%o0 + 0x38], %f22
+	ldd		[%o0 + 0x40], %f24
+	ldd		[%o0 + 0x48], %f26
+	ldd		[%o0 + 0x50], %f28
+	ldd		[%o0 + 0x58], %f30
+	ldd		[%o0 + 0x60], %f32
+	ldd		[%o0 + 0x68], %f34
+	ldd		[%o0 + 0x70], %f36
+	ldd		[%o0 + 0x78], %f38
+	ldd		[%o0 + 0x80], %f40
+	ldd		[%o0 + 0x88], %f42
+	ldd		[%o0 + 0x90], %f44
+	ldd		[%o0 + 0x98], %f46
+	ldd		[%o0 + 0xa0], %f48
+	ldd		[%o0 + 0xa8], %f50
+
+
+	ENCRYPT_128(12, 4, 6, 0, 2)
+
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_encrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_encrypt_256)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+
+	ldd		[%o0 + 0x00], %f8
+	ldd		[%o0 + 0x08], %f10
+
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	ldd		[%o0 + 0x10], %f8
+
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	add		%o0, 0x20, %o0
+
+	ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+	ldd		[%o0 + 0x10], %f8
+
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	add		%o0, 0x20, %o0
+
+	ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+	ldd		[%o0 + 0x10], %f12
+	ldd		[%o0 + 0x18], %f14
+	ldd		[%o0 + 0x20], %f16
+	ldd		[%o0 + 0x28], %f18
+	ldd		[%o0 + 0x30], %f20
+	ldd		[%o0 + 0x38], %f22
+	ldd		[%o0 + 0x40], %f24
+	ldd		[%o0 + 0x48], %f26
+	ldd		[%o0 + 0x50], %f28
+	ldd		[%o0 + 0x58], %f30
+	ldd		[%o0 + 0x60], %f32
+	ldd		[%o0 + 0x68], %f34
+	ldd		[%o0 + 0x70], %f36
+	ldd		[%o0 + 0x78], %f38
+	ldd		[%o0 + 0x80], %f40
+	ldd		[%o0 + 0x88], %f42
+	ldd		[%o0 + 0x90], %f44
+	ldd		[%o0 + 0x98], %f46
+	ldd		[%o0 + 0xa0], %f48
+	ldd		[%o0 + 0xa8], %f50
+
+	ENCRYPT_128(12, 4, 6, 0, 2)
+
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_encrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_decrypt_128)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0xa0], %f8
+	ldd		[%o0 + 0xa8], %f10
+	ldd		[%o0 + 0x98], %f12
+	ldd		[%o0 + 0x90], %f14
+	ldd		[%o0 + 0x88], %f16
+	ldd		[%o0 + 0x80], %f18
+	ldd		[%o0 + 0x78], %f20
+	ldd		[%o0 + 0x70], %f22
+	ldd		[%o0 + 0x68], %f24
+	ldd		[%o0 + 0x60], %f26
+	ldd		[%o0 + 0x58], %f28
+	ldd		[%o0 + 0x50], %f30
+	ldd		[%o0 + 0x48], %f32
+	ldd		[%o0 + 0x40], %f34
+	ldd		[%o0 + 0x38], %f36
+	ldd		[%o0 + 0x30], %f38
+	ldd		[%o0 + 0x28], %f40
+	ldd		[%o0 + 0x20], %f42
+	ldd		[%o0 + 0x18], %f44
+	ldd		[%o0 + 0x10], %f46
+	ldd		[%o0 + 0x08], %f48
+	ldd		[%o0 + 0x00], %f50
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+	DECRYPT_128(12, 4, 6, 0, 2)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_decrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_decrypt_192)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0xc0], %f8
+	ldd		[%o0 + 0xc8], %f10
+	ldd		[%o0 + 0xb8], %f12
+	ldd		[%o0 + 0xb0], %f14
+	ldd		[%o0 + 0xa8], %f16
+	ldd		[%o0 + 0xa0], %f18
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+	ldd		[%o0 + 0x98], %f20
+	ldd		[%o0 + 0x90], %f22
+	ldd		[%o0 + 0x88], %f24
+	ldd		[%o0 + 0x80], %f26
+	DECRYPT_TWO_ROUNDS(12, 4, 6, 0, 2)
+	ldd		[%o0 + 0x78], %f28
+	ldd		[%o0 + 0x70], %f30
+	ldd		[%o0 + 0x68], %f32
+	ldd		[%o0 + 0x60], %f34
+	ldd		[%o0 + 0x58], %f36
+	ldd		[%o0 + 0x50], %f38
+	ldd		[%o0 + 0x48], %f40
+	ldd		[%o0 + 0x40], %f42
+	ldd		[%o0 + 0x38], %f44
+	ldd		[%o0 + 0x30], %f46
+	ldd		[%o0 + 0x28], %f48
+	ldd		[%o0 + 0x20], %f50
+	ldd		[%o0 + 0x18], %f52
+	ldd		[%o0 + 0x10], %f54
+	ldd		[%o0 + 0x08], %f56
+	ldd		[%o0 + 0x00], %f58
+	DECRYPT_128(20, 4, 6, 0, 2)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_decrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_decrypt_256)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0xe0], %f8
+	ldd		[%o0 + 0xe8], %f10
+	ldd		[%o0 + 0xd8], %f12
+	ldd		[%o0 + 0xd0], %f14
+	ldd		[%o0 + 0xc8], %f16
+	fxor		%f8, %f4, %f4
+	ldd		[%o0 + 0xc0], %f18
+	fxor		%f10, %f6, %f6
+	ldd		[%o0 + 0xb8], %f20
+	AES_DROUND23(12, 4, 6, 2)
+	ldd		[%o0 + 0xb0], %f22
+	AES_DROUND01(14, 4, 6, 0)
+	ldd		[%o0 + 0xa8], %f24
+	AES_DROUND23(16, 0, 2, 6)
+	ldd		[%o0 + 0xa0], %f26
+	AES_DROUND01(18, 0, 2, 4)
+	ldd		[%o0 + 0x98], %f12
+	AES_DROUND23(20, 4, 6, 2)
+	ldd		[%o0 + 0x90], %f14
+	AES_DROUND01(22, 4, 6, 0)
+	ldd		[%o0 + 0x88], %f16
+	AES_DROUND23(24, 0, 2, 6)
+	ldd		[%o0 + 0x80], %f18
+	AES_DROUND01(26, 0, 2, 4)
+	ldd		[%o0 + 0x78], %f20
+	AES_DROUND23(12, 4, 6, 2)
+	ldd		[%o0 + 0x70], %f22
+	AES_DROUND01(14, 4, 6, 0)
+	ldd		[%o0 + 0x68], %f24
+	AES_DROUND23(16, 0, 2, 6)
+	ldd		[%o0 + 0x60], %f26
+	AES_DROUND01(18, 0, 2, 4)
+	ldd		[%o0 + 0x58], %f28
+	AES_DROUND23(20, 4, 6, 2)
+	ldd		[%o0 + 0x50], %f30
+	AES_DROUND01(22, 4, 6, 0)
+	ldd		[%o0 + 0x48], %f32
+	AES_DROUND23(24, 0, 2, 6)
+	ldd		[%o0 + 0x40], %f34
+	AES_DROUND01(26, 0, 2, 4)
+	ldd		[%o0 + 0x38], %f36
+	AES_DROUND23(28, 4, 6, 2)
+	ldd		[%o0 + 0x30], %f38
+	AES_DROUND01(30, 4, 6, 0)
+	ldd		[%o0 + 0x28], %f40
+	AES_DROUND23(32, 0, 2, 6)
+	ldd		[%o0 + 0x20], %f42
+	AES_DROUND01(34, 0, 2, 4)
+	ldd		[%o0 + 0x18], %f44
+	AES_DROUND23(36, 4, 6, 2)
+	ldd		[%o0 + 0x10], %f46
+	AES_DROUND01(38, 4, 6, 0)
+	ldd		[%o0 + 0x08], %f48
+	AES_DROUND23(40, 0, 2, 6)
+	ldd		[%o0 + 0x00], %f50
+	AES_DROUND01(42, 0, 2, 4)
+	AES_DROUND23(44, 4, 6, 2)
+	AES_DROUND01(46, 4, 6, 0)
+	AES_DROUND23_L(48, 0, 2, 6)
+	AES_DROUND01_L(50, 0, 2, 4)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_decrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_load_encrypt_keys_128)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	ldd		[%o0 + 0x30], %f16
+	ldd		[%o0 + 0x38], %f18
+	ldd		[%o0 + 0x40], %f20
+	ldd		[%o0 + 0x48], %f22
+	ldd		[%o0 + 0x50], %f24
+	ldd		[%o0 + 0x58], %f26
+	ldd		[%o0 + 0x60], %f28
+	ldd		[%o0 + 0x68], %f30
+	ldd		[%o0 + 0x70], %f32
+	ldd		[%o0 + 0x78], %f34
+	ldd		[%o0 + 0x80], %f36
+	ldd		[%o0 + 0x88], %f38
+	ldd		[%o0 + 0x90], %f40
+	ldd		[%o0 + 0x98], %f42
+	ldd		[%o0 + 0xa0], %f44
+	retl
+	 ldd		[%o0 + 0xa8], %f46
+ENDPROC(aes_sparc64_load_encrypt_keys_128)
+
+	.align		32
+ENTRY(aes_sparc64_load_encrypt_keys_192)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	ldd		[%o0 + 0x30], %f16
+	ldd		[%o0 + 0x38], %f18
+	ldd		[%o0 + 0x40], %f20
+	ldd		[%o0 + 0x48], %f22
+	ldd		[%o0 + 0x50], %f24
+	ldd		[%o0 + 0x58], %f26
+	ldd		[%o0 + 0x60], %f28
+	ldd		[%o0 + 0x68], %f30
+	ldd		[%o0 + 0x70], %f32
+	ldd		[%o0 + 0x78], %f34
+	ldd		[%o0 + 0x80], %f36
+	ldd		[%o0 + 0x88], %f38
+	ldd		[%o0 + 0x90], %f40
+	ldd		[%o0 + 0x98], %f42
+	ldd		[%o0 + 0xa0], %f44
+	ldd		[%o0 + 0xa8], %f46
+	ldd		[%o0 + 0xb0], %f48
+	ldd		[%o0 + 0xb8], %f50
+	ldd		[%o0 + 0xc0], %f52
+	retl
+	 ldd		[%o0 + 0xc8], %f54
+ENDPROC(aes_sparc64_load_encrypt_keys_192)
+
+	.align		32
+ENTRY(aes_sparc64_load_encrypt_keys_256)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	ldd		[%o0 + 0x30], %f16
+	ldd		[%o0 + 0x38], %f18
+	ldd		[%o0 + 0x40], %f20
+	ldd		[%o0 + 0x48], %f22
+	ldd		[%o0 + 0x50], %f24
+	ldd		[%o0 + 0x58], %f26
+	ldd		[%o0 + 0x60], %f28
+	ldd		[%o0 + 0x68], %f30
+	ldd		[%o0 + 0x70], %f32
+	ldd		[%o0 + 0x78], %f34
+	ldd		[%o0 + 0x80], %f36
+	ldd		[%o0 + 0x88], %f38
+	ldd		[%o0 + 0x90], %f40
+	ldd		[%o0 + 0x98], %f42
+	ldd		[%o0 + 0xa0], %f44
+	ldd		[%o0 + 0xa8], %f46
+	ldd		[%o0 + 0xb0], %f48
+	ldd		[%o0 + 0xb8], %f50
+	ldd		[%o0 + 0xc0], %f52
+	ldd		[%o0 + 0xc8], %f54
+	ldd		[%o0 + 0xd0], %f56
+	ldd		[%o0 + 0xd8], %f58
+	ldd		[%o0 + 0xe0], %f60
+	retl
+	 ldd		[%o0 + 0xe8], %f62
+ENDPROC(aes_sparc64_load_encrypt_keys_256)
+
+	.align		32
+ENTRY(aes_sparc64_load_decrypt_keys_128)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x98], %f8
+	ldd		[%o0 + 0x90], %f10
+	ldd		[%o0 + 0x88], %f12
+	ldd		[%o0 + 0x80], %f14
+	ldd		[%o0 + 0x78], %f16
+	ldd		[%o0 + 0x70], %f18
+	ldd		[%o0 + 0x68], %f20
+	ldd		[%o0 + 0x60], %f22
+	ldd		[%o0 + 0x58], %f24
+	ldd		[%o0 + 0x50], %f26
+	ldd		[%o0 + 0x48], %f28
+	ldd		[%o0 + 0x40], %f30
+	ldd		[%o0 + 0x38], %f32
+	ldd		[%o0 + 0x30], %f34
+	ldd		[%o0 + 0x28], %f36
+	ldd		[%o0 + 0x20], %f38
+	ldd		[%o0 + 0x18], %f40
+	ldd		[%o0 + 0x10], %f42
+	ldd		[%o0 + 0x08], %f44
+	retl
+	 ldd		[%o0 + 0x00], %f46
+ENDPROC(aes_sparc64_load_decrypt_keys_128)
+
+	.align		32
+ENTRY(aes_sparc64_load_decrypt_keys_192)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0xb8], %f8
+	ldd		[%o0 + 0xb0], %f10
+	ldd		[%o0 + 0xa8], %f12
+	ldd		[%o0 + 0xa0], %f14
+	ldd		[%o0 + 0x98], %f16
+	ldd		[%o0 + 0x90], %f18
+	ldd		[%o0 + 0x88], %f20
+	ldd		[%o0 + 0x80], %f22
+	ldd		[%o0 + 0x78], %f24
+	ldd		[%o0 + 0x70], %f26
+	ldd		[%o0 + 0x68], %f28
+	ldd		[%o0 + 0x60], %f30
+	ldd		[%o0 + 0x58], %f32
+	ldd		[%o0 + 0x50], %f34
+	ldd		[%o0 + 0x48], %f36
+	ldd		[%o0 + 0x40], %f38
+	ldd		[%o0 + 0x38], %f40
+	ldd		[%o0 + 0x30], %f42
+	ldd		[%o0 + 0x28], %f44
+	ldd		[%o0 + 0x20], %f46
+	ldd		[%o0 + 0x18], %f48
+	ldd		[%o0 + 0x10], %f50
+	ldd		[%o0 + 0x08], %f52
+	retl
+	 ldd		[%o0 + 0x00], %f54
+ENDPROC(aes_sparc64_load_decrypt_keys_192)
+
+	.align		32
+ENTRY(aes_sparc64_load_decrypt_keys_256)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0xd8], %f8
+	ldd		[%o0 + 0xd0], %f10
+	ldd		[%o0 + 0xc8], %f12
+	ldd		[%o0 + 0xc0], %f14
+	ldd		[%o0 + 0xb8], %f16
+	ldd		[%o0 + 0xb0], %f18
+	ldd		[%o0 + 0xa8], %f20
+	ldd		[%o0 + 0xa0], %f22
+	ldd		[%o0 + 0x98], %f24
+	ldd		[%o0 + 0x90], %f26
+	ldd		[%o0 + 0x88], %f28
+	ldd		[%o0 + 0x80], %f30
+	ldd		[%o0 + 0x78], %f32
+	ldd		[%o0 + 0x70], %f34
+	ldd		[%o0 + 0x68], %f36
+	ldd		[%o0 + 0x60], %f38
+	ldd		[%o0 + 0x58], %f40
+	ldd		[%o0 + 0x50], %f42
+	ldd		[%o0 + 0x48], %f44
+	ldd		[%o0 + 0x40], %f46
+	ldd		[%o0 + 0x38], %f48
+	ldd		[%o0 + 0x30], %f50
+	ldd		[%o0 + 0x28], %f52
+	ldd		[%o0 + 0x20], %f54
+	ldd		[%o0 + 0x18], %f56
+	ldd		[%o0 + 0x10], %f58
+	ldd		[%o0 + 0x08], %f60
+	retl
+	 ldd		[%o0 + 0x00], %f62
+ENDPROC(aes_sparc64_load_decrypt_keys_256)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_encrypt_128)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 + 0x00], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	ENCRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	ENCRYPT_128(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_encrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_encrypt_192)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 + 0x00], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	ENCRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	ENCRYPT_192(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_encrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_encrypt_256)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 + 0x00], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	ENCRYPT_256_2(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f0, [%o2 + 0x10]
+	std		%f2, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	ENCRYPT_256(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_encrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_decrypt_128)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 - 0x10], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 - 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	DECRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz,pt		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_128(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_decrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_decrypt_192)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 - 0x10], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 - 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	DECRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz,pt		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_192(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_decrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_decrypt_256)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 - 0x10], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 - 0x08], %g2
+	sub		%o0, 0xf0, %o0
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	DECRYPT_256_2(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f0, [%o2 + 0x10]
+	std		%f2, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz,pt		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_256(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_decrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_encrypt_128)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd		[%o4 + 0x00], %f4
+	ldd		[%o4 + 0x08], %f6
+	ldx		[%o0 + 0x00], %g1
+	ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	ENCRYPT_128(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	std		%f4, [%o4 + 0x00]
+	std		%f6, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_encrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_encrypt_192)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd		[%o4 + 0x00], %f4
+	ldd		[%o4 + 0x08], %f6
+	ldx		[%o0 + 0x00], %g1
+	ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	ENCRYPT_192(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	std		%f4, [%o4 + 0x00]
+	std		%f6, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_encrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_encrypt_256)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd		[%o4 + 0x00], %f4
+	ldd		[%o4 + 0x08], %f6
+	ldx		[%o0 + 0x00], %g1
+	ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	ENCRYPT_256(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	std		%f4, [%o4 + 0x00]
+	std		%f6, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_encrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_decrypt_128)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */
+	ldx		[%o0 - 0x10], %g1
+	ldx		[%o0 - 0x08], %g2
+	ldx		[%o4 + 0x00], %o0
+	ldx		[%o4 + 0x08], %o5
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_128(8, 4, 6, 0, 2)
+	MOVXTOD_O0_F0
+	MOVXTOD_O5_F2
+	xor		%g1, %g3, %o0
+	xor		%g2, %g7, %o5
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	stx		%o0, [%o4 + 0x00]
+	stx		%o5, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_decrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_decrypt_192)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */
+	ldx		[%o0 - 0x10], %g1
+	ldx		[%o0 - 0x08], %g2
+	ldx		[%o4 + 0x00], %o0
+	ldx		[%o4 + 0x08], %o5
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_192(8, 4, 6, 0, 2)
+	MOVXTOD_O0_F0
+	MOVXTOD_O5_F2
+	xor		%g1, %g3, %o0
+	xor		%g2, %g7, %o5
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	stx		%o0, [%o4 + 0x00]
+	stx		%o5, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_decrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_decrypt_256)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */
+	ldx		[%o0 - 0x10], %g1
+	ldx		[%o0 - 0x08], %g2
+	ldx		[%o4 + 0x00], %o0
+	ldx		[%o4 + 0x08], %o5
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_256(8, 4, 6, 0, 2)
+	MOVXTOD_O0_F0
+	MOVXTOD_O5_F2
+	xor		%g1, %g3, %o0
+	xor		%g2, %g7, %o5
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	stx		%o0, [%o4 + 0x00]
+	stx		%o5, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_decrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_ctr_crypt_128)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldx		[%o4 + 0x00], %g3
+	ldx		[%o4 + 0x08], %g7
+	subcc		%o3, 0x10, %o3
+	ldx		[%o0 + 0x00], %g1
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F4
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F6
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_128_2(8, 0, 2, 4, 6, 56, 58, 60, 62)
+	ldd		[%o1 + 0x00], %f56
+	ldd		[%o1 + 0x08], %f58
+	ldd		[%o1 + 0x10], %f60
+	ldd		[%o1 + 0x18], %f62
+	fxor		%f56, %f0, %f56
+	fxor		%f58, %f2, %f58
+	fxor		%f60, %f4, %f60
+	fxor		%f62, %f6, %f62
+	std		%f56, [%o2 + 0x00]
+	std		%f58, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	subcc		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_128(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f4
+	ldd		[%o1 + 0x08], %f6
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	stx		%g3, [%o4 + 0x00]
+	retl
+	 stx		%g7, [%o4 + 0x08]
+ENDPROC(aes_sparc64_ctr_crypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_ctr_crypt_192)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldx		[%o4 + 0x00], %g3
+	ldx		[%o4 + 0x08], %g7
+	subcc		%o3, 0x10, %o3
+	ldx		[%o0 + 0x00], %g1
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F4
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F6
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_192_2(8, 0, 2, 4, 6, 56, 58, 60, 62)
+	ldd		[%o1 + 0x00], %f56
+	ldd		[%o1 + 0x08], %f58
+	ldd		[%o1 + 0x10], %f60
+	ldd		[%o1 + 0x18], %f62
+	fxor		%f56, %f0, %f56
+	fxor		%f58, %f2, %f58
+	fxor		%f60, %f4, %f60
+	fxor		%f62, %f6, %f62
+	std		%f56, [%o2 + 0x00]
+	std		%f58, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	subcc		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_192(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f4
+	ldd		[%o1 + 0x08], %f6
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	stx		%g3, [%o4 + 0x00]
+	retl
+	 stx		%g7, [%o4 + 0x08]
+ENDPROC(aes_sparc64_ctr_crypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_ctr_crypt_256)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldx		[%o4 + 0x00], %g3
+	ldx		[%o4 + 0x08], %g7
+	subcc		%o3, 0x10, %o3
+	ldx		[%o0 + 0x00], %g1
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F4
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F6
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_256_2(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f56
+	ldd		[%o1 + 0x08], %f58
+	ldd		[%o1 + 0x10], %f60
+	ldd		[%o1 + 0x18], %f62
+	fxor		%f56, %f0, %f56
+	fxor		%f58, %f2, %f58
+	fxor		%f60, %f4, %f60
+	fxor		%f62, %f6, %f62
+	std		%f56, [%o2 + 0x00]
+	std		%f58, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	subcc		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+	ldd		[%o0 + 0xd0], %f56
+	ldd		[%o0 + 0xd8], %f58
+	ldd		[%o0 + 0xe0], %f60
+	ldd		[%o0 + 0xe8], %f62
+10:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_256(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f4
+	ldd		[%o1 + 0x08], %f6
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	stx		%g3, [%o4 + 0x00]
+	retl
+	 stx		%g7, [%o4 + 0x08]
+ENDPROC(aes_sparc64_ctr_crypt_256)
--- a/arch/sparc/crypto/aes_glue.c
+++ b/arch/sparc/crypto/aes_glue.c
+/* Glue code for AES encryption optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/aesni-intel_glue.c
+ *
+ * Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+
+#include <asm/fpumacro.h>
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+struct aes_ops {
+	void (*encrypt)(const u64 *key, const u32 *input, u32 *output);
+	void (*decrypt)(const u64 *key, const u32 *input, u32 *output);
+	void (*load_encrypt_keys)(const u64 *key);
+	void (*load_decrypt_keys)(const u64 *key);
+	void (*ecb_encrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len);
+	void (*ecb_decrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len);
+	void (*cbc_encrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len, u64 *iv);
+	void (*cbc_decrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len, u64 *iv);
+	void (*ctr_crypt)(const u64 *key, const u64 *input, u64 *output,
+			  unsigned int len, u64 *iv);
+};
+
+struct crypto_sparc64_aes_ctx {
+	struct aes_ops *ops;
+	u64 key[AES_MAX_KEYLENGTH / sizeof(u64)];
+	u32 key_length;
+	u32 expanded_key_length;
+};
+
+extern void aes_sparc64_encrypt_128(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_encrypt_192(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_encrypt_256(const u64 *key, const u32 *input,
+				    u32 *output);
+
+extern void aes_sparc64_decrypt_128(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_decrypt_192(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_decrypt_256(const u64 *key, const u32 *input,
+				    u32 *output);
+
+extern void aes_sparc64_load_encrypt_keys_128(const u64 *key);
+extern void aes_sparc64_load_encrypt_keys_192(const u64 *key);
+extern void aes_sparc64_load_encrypt_keys_256(const u64 *key);
+
+extern void aes_sparc64_load_decrypt_keys_128(const u64 *key);
+extern void aes_sparc64_load_decrypt_keys_192(const u64 *key);
+extern void aes_sparc64_load_decrypt_keys_256(const u64 *key);
+
+extern void aes_sparc64_ecb_encrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_encrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_encrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+
+extern void aes_sparc64_ecb_decrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_decrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_decrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+
+extern void aes_sparc64_cbc_encrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_encrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_encrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_decrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_decrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_decrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_ctr_crypt_128(const u64 *key, const u64 *input,
+				      u64 *output, unsigned int len,
+				      u64 *iv);
+extern void aes_sparc64_ctr_crypt_192(const u64 *key, const u64 *input,
+				      u64 *output, unsigned int len,
+				      u64 *iv);
+extern void aes_sparc64_ctr_crypt_256(const u64 *key, const u64 *input,
+				      u64 *output, unsigned int len,
+				      u64 *iv);
+
+struct aes_ops aes128_ops = {
+	.encrypt		= aes_sparc64_encrypt_128,
+	.decrypt		= aes_sparc64_decrypt_128,
+	.load_encrypt_keys	= aes_sparc64_load_encrypt_keys_128,
+	.load_decrypt_keys	= aes_sparc64_load_decrypt_keys_128,
+	.ecb_encrypt		= aes_sparc64_ecb_encrypt_128,
+	.ecb_decrypt		= aes_sparc64_ecb_decrypt_128,
+	.cbc_encrypt		= aes_sparc64_cbc_encrypt_128,
+	.cbc_decrypt		= aes_sparc64_cbc_decrypt_128,
+	.ctr_crypt		= aes_sparc64_ctr_crypt_128,
+};
+
+struct aes_ops aes192_ops = {
+	.encrypt		= aes_sparc64_encrypt_192,
+	.decrypt		= aes_sparc64_decrypt_192,
+	.load_encrypt_keys	= aes_sparc64_load_encrypt_keys_192,
+	.load_decrypt_keys	= aes_sparc64_load_decrypt_keys_192,
+	.ecb_encrypt		= aes_sparc64_ecb_encrypt_192,
+	.ecb_decrypt		= aes_sparc64_ecb_decrypt_192,
+	.cbc_encrypt		= aes_sparc64_cbc_encrypt_192,
+	.cbc_decrypt		= aes_sparc64_cbc_decrypt_192,
+	.ctr_crypt		= aes_sparc64_ctr_crypt_192,
+};
+
+struct aes_ops aes256_ops = {
+	.encrypt		= aes_sparc64_encrypt_256,
+	.decrypt		= aes_sparc64_decrypt_256,
+	.load_encrypt_keys	= aes_sparc64_load_encrypt_keys_256,
+	.load_decrypt_keys	= aes_sparc64_load_decrypt_keys_256,
+	.ecb_encrypt		= aes_sparc64_ecb_encrypt_256,
+	.ecb_decrypt		= aes_sparc64_ecb_decrypt_256,
+	.cbc_encrypt		= aes_sparc64_cbc_encrypt_256,
+	.cbc_decrypt		= aes_sparc64_cbc_decrypt_256,
+	.ctr_crypt		= aes_sparc64_ctr_crypt_256,
+};
+
+extern void aes_sparc64_key_expand(const u32 *in_key, u64 *output_key,
+				   unsigned int key_len);
+
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+
+	switch (key_len) {
+	case AES_KEYSIZE_128:
+		ctx->expanded_key_length = 0xb0;
+		ctx->ops = &aes128_ops;
+		break;
+
+	case AES_KEYSIZE_192:
+		ctx->expanded_key_length = 0xd0;
+		ctx->ops = &aes192_ops;
+		break;
+
+	case AES_KEYSIZE_256:
+		ctx->expanded_key_length = 0xf0;
+		ctx->ops = &aes256_ops;
+		break;
+
+	default:
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	aes_sparc64_key_expand((const u32 *)in_key, &ctx->key[0], key_len);
+	ctx->key_length = key_len;
+
+	return 0;
+}
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->ops->encrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->ops->decrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst);
+}
+
+#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->ecb_encrypt(&ctx->key[0],
+					      (const u64 *)walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	u64 *key_end;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_decrypt_keys(&ctx->key[0]);
+	key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)];
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->ecb_decrypt(key_end,
+					      (const u64 *) walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr, block_len);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+
+	return err;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->cbc_encrypt(&ctx->key[0],
+					      (const u64 *)walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len, (u64 *) walk.iv);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	u64 *key_end;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_decrypt_keys(&ctx->key[0]);
+	key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)];
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->cbc_decrypt(key_end,
+					      (const u64 *) walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len, (u64 *) walk.iv);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+
+	return err;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc,
+		     struct scatterlist *dst, struct scatterlist *src,
+		     unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->ctr_crypt(&ctx->key[0],
+					    (const u64 *)walk.src.virt.addr,
+					    (u64 *) walk.dst.virt.addr,
+					    block_len, (u64 *) walk.iv);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static struct crypto_alg algs[] = { {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 3,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= aes_encrypt,
+			.cia_decrypt		= aes_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+} };
+
+static bool __init sparc64_has_aes_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_AES))
+		return false;
+
+	return true;
+}
+
+static int __init aes_sparc64_mod_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		INIT_LIST_HEAD(&algs[i].cra_list);
+
+	if (sparc64_has_aes_opcode()) {
+		pr_info("Using sparc64 aes opcodes optimized AES implementation\n");
+		return crypto_register_algs(algs, ARRAY_SIZE(algs));
+	}
+	pr_info("sparc64 aes opcodes not available.\n");
+	return -ENODEV;
+}
+
+static void __exit aes_sparc64_mod_fini(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(aes_sparc64_mod_init);
+module_exit(aes_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("AES Secure Hash Algorithm, sparc64 aes opcode accelerated");
+
+MODULE_ALIAS("aes");
--- a/arch/sparc/crypto/camellia_asm.S
+++ b/arch/sparc/crypto/camellia_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+#define CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \
+	CAMELLIA_F(KEY_BASE +  0, I1, I0, I1) \
+	CAMELLIA_F(KEY_BASE +  2, I0, I1, I0) \
+	CAMELLIA_F(KEY_BASE +  4, I1, I0, I1) \
+	CAMELLIA_F(KEY_BASE +  6, I0, I1, I0) \
+	CAMELLIA_F(KEY_BASE +  8, I1, I0, I1) \
+	CAMELLIA_F(KEY_BASE + 10, I0, I1, I0)
+
+#define CAMELLIA_6ROUNDS_FL_FLI(KEY_BASE, I0, I1) \
+	CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \
+	CAMELLIA_FL(KEY_BASE + 12, I0, I0) \
+	CAMELLIA_FLI(KEY_BASE + 14, I1, I1)
+
+	.data
+
+	.align	8
+SIGMA:	.xword	0xA09E667F3BCC908B
+	.xword	0xB67AE8584CAA73B2
+	.xword	0xC6EF372FE94F82BE
+	.xword	0x54FF53A5F1D36F1C
+	.xword	0x10E527FADE682D1D
+	.xword	0xB05688C2B3E6C1FD
+
+	.text
+
+	.align	32
+ENTRY(camellia_sparc64_key_expand)
+	/* %o0=in_key, %o1=encrypt_key, %o2=key_len, %o3=decrypt_key */
+	VISEntry
+	ld	[%o0 + 0x00], %f0	! i0, k[0]
+	ld	[%o0 + 0x04], %f1	! i1, k[1]
+	ld	[%o0 + 0x08], %f2	! i2, k[2]
+	ld	[%o0 + 0x0c], %f3	! i3, k[3]
+	std	%f0, [%o1 + 0x00]	! k[0, 1]
+	fsrc2	%f0, %f28
+	std	%f2, [%o1 + 0x08]	! k[2, 3]
+	cmp	%o2, 16
+	be	10f
+	 fsrc2	%f2, %f30
+
+	ld	[%o0 + 0x10], %f0
+	ld	[%o0 + 0x14], %f1
+	std	%f0, [%o1 + 0x20]	! k[8, 9]
+	cmp	%o2, 24
+	fone	%f10
+	be,a	1f
+	 fxor	%f10, %f0, %f2
+	ld	[%o0 + 0x18], %f2
+	ld	[%o0 + 0x1c], %f3
+1:
+	std	%f2, [%o1 + 0x28]	! k[10, 11]
+	fxor	%f28, %f0, %f0
+	fxor	%f30, %f2, %f2
+
+10:
+	sethi	%hi(SIGMA), %g3
+	or	%g3, %lo(SIGMA), %g3
+	ldd	[%g3 + 0x00], %f16
+	ldd	[%g3 + 0x08], %f18
+	ldd	[%g3 + 0x10], %f20
+	ldd	[%g3 + 0x18], %f22
+	ldd	[%g3 + 0x20], %f24
+	ldd	[%g3 + 0x28], %f26
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	fxor	%f28, %f0, %f0
+	fxor	%f30, %f2, %f2
+	CAMELLIA_F(20, 2, 0, 2)
+	CAMELLIA_F(22, 0, 2, 0)
+
+#define ROTL128(S01, S23, TMP1, TMP2, N)	\
+	srlx	S01, (64 - N), TMP1;		\
+	sllx	S01, N, S01;			\
+	srlx	S23, (64 - N), TMP2;		\
+	sllx	S23, N, S23;			\
+	or	S01, TMP2, S01;			\
+	or	S23, TMP1, S23
+
+	cmp	%o2, 16
+	bne	1f
+	 nop
+	/* 128-bit key */
+	std	%f0, [%o1 + 0x10]	! k[ 4,  5]
+	std	%f2, [%o1 + 0x18]	! k[ 6,  7]
+	MOVDTOX_F0_O4
+	MOVDTOX_F2_O5
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x30]	! k[12, 13]
+	stx	%o5, [%o1 + 0x38]	! k[14, 15]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x40]	! k[16, 17]
+	stx	%o5, [%o1 + 0x48]	! k[18, 19]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x60]	! k[24, 25]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x70]	! k[28, 29]
+	stx	%o5, [%o1 + 0x78]	! k[30, 31]
+	ROTL128(%o4, %o5, %g2, %g3, 34)
+	stx	%o4, [%o1 + 0xa0]	! k[40, 41]
+	stx	%o5, [%o1 + 0xa8]	! k[42, 43]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0xc0]	! k[48, 49]
+	stx	%o5, [%o1 + 0xc8]	! k[50, 51]
+
+	ldx	[%o1 + 0x00], %o4	! k[ 0,  1]
+	ldx	[%o1 + 0x08], %o5	! k[ 2,  3]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x20]	! k[ 8,  9]
+	stx	%o5, [%o1 + 0x28]	! k[10, 11]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x50]	! k[20, 21]
+	stx	%o5, [%o1 + 0x58]	! k[22, 23]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o5, [%o1 + 0x68]	! k[26, 27]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0x80]	! k[32, 33]
+	stx	%o5, [%o1 + 0x88]	! k[34, 35]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0x90]	! k[36, 37]
+	stx	%o5, [%o1 + 0x98]	! k[38, 39]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0xb0]	! k[44, 45]
+	stx	%o5, [%o1 + 0xb8]	! k[46, 47]
+
+	ba,pt	%xcc, 2f
+	 mov	(3 * 16 * 4), %o0
+
+1:
+	/* 192-bit or 256-bit key */
+	std	%f0, [%o1 + 0x30]	! k[12, 13]
+	std	%f2, [%o1 + 0x38]	! k[14, 15]
+	ldd	[%o1 + 0x20], %f4	! k[ 8,  9]
+	ldd	[%o1 + 0x28], %f6	! k[10, 11]
+	fxor	%f0, %f4, %f0
+	fxor	%f2, %f6, %f2
+	CAMELLIA_F(24, 2, 0, 2)
+	CAMELLIA_F(26, 0, 2, 0)
+	std	%f0, [%o1 + 0x10]	! k[ 4,  5]
+	std	%f2, [%o1 + 0x18]	! k[ 6,  7]
+	MOVDTOX_F0_O4
+	MOVDTOX_F2_O5
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x50]	! k[20, 21]
+	stx	%o5, [%o1 + 0x58]	! k[22, 23]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0xa0]	! k[40, 41]
+	stx	%o5, [%o1 + 0xa8]	! k[42, 43]
+	ROTL128(%o4, %o5, %g2, %g3, 51)
+	stx	%o4, [%o1 + 0x100]	! k[64, 65]
+	stx	%o5, [%o1 + 0x108]	! k[66, 67]
+	ldx	[%o1 + 0x20], %o4	! k[ 8,  9]
+	ldx	[%o1 + 0x28], %o5	! k[10, 11]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x20]	! k[ 8,  9]
+	stx	%o5, [%o1 + 0x28]	! k[10, 11]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x40]	! k[16, 17]
+	stx	%o5, [%o1 + 0x48]	! k[18, 19]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x90]	! k[36, 37]
+	stx	%o5, [%o1 + 0x98]	! k[38, 39]
+	ROTL128(%o4, %o5, %g2, %g3, 34)
+	stx	%o4, [%o1 + 0xd0]	! k[52, 53]
+	stx	%o5, [%o1 + 0xd8]	! k[54, 55]
+	ldx	[%o1 + 0x30], %o4	! k[12, 13]
+	ldx	[%o1 + 0x38], %o5	! k[14, 15]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x30]	! k[12, 13]
+	stx	%o5, [%o1 + 0x38]	! k[14, 15]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x70]	! k[28, 29]
+	stx	%o5, [%o1 + 0x78]	! k[30, 31]
+	srlx	%o4, 32, %g2
+	srlx	%o5, 32, %g3
+	stw	%o4, [%o1 + 0xc0]	! k[48]
+	stw	%g3, [%o1 + 0xc4]	! k[49]
+	stw	%o5, [%o1 + 0xc8]	! k[50]
+	stw	%g2, [%o1 + 0xcc]	! k[51]
+	ROTL128(%o4, %o5, %g2, %g3, 49)
+	stx	%o4, [%o1 + 0xe0]	! k[56, 57]
+	stx	%o5, [%o1 + 0xe8]	! k[58, 59]
+	ldx	[%o1 + 0x00], %o4	! k[ 0,  1]
+	ldx	[%o1 + 0x08], %o5	! k[ 2,  3]
+	ROTL128(%o4, %o5, %g2, %g3, 45)
+	stx	%o4, [%o1 + 0x60]	! k[24, 25]
+	stx	%o5, [%o1 + 0x68]	! k[26, 27]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x80]	! k[32, 33]
+	stx	%o5, [%o1 + 0x88]	! k[34, 35]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0xb0]	! k[44, 45]
+	stx	%o5, [%o1 + 0xb8]	! k[46, 47]
+	ROTL128(%o4, %o5, %g2, %g3, 34)
+	stx	%o4, [%o1 + 0xf0]	! k[60, 61]
+	stx	%o5, [%o1 + 0xf8]	! k[62, 63]
+	mov	(4 * 16 * 4), %o0
+2:
+	add	%o1, %o0, %o1
+	ldd	[%o1 + 0x00], %f0
+	ldd	[%o1 + 0x08], %f2
+	std	%f0, [%o3 + 0x00]
+	std	%f2, [%o3 + 0x08]
+	add	%o3, 0x10, %o3
+1:
+	sub	%o1, (16 * 4), %o1
+	ldd	[%o1 + 0x38], %f0
+	ldd	[%o1 + 0x30], %f2
+	ldd	[%o1 + 0x28], %f4
+	ldd	[%o1 + 0x20], %f6
+	ldd	[%o1 + 0x18], %f8
+	ldd	[%o1 + 0x10], %f10
+	std	%f0, [%o3 + 0x00]
+	std	%f2, [%o3 + 0x08]
+	std	%f4, [%o3 + 0x10]
+	std	%f6, [%o3 + 0x18]
+	std	%f8, [%o3 + 0x20]
+	std	%f10, [%o3 + 0x28]
+
+	ldd	[%o1 + 0x08], %f0
+	ldd	[%o1 + 0x00], %f2
+	std	%f0, [%o3 + 0x30]
+	std	%f2, [%o3 + 0x38]
+	subcc	%o0, (16 * 4), %o0
+	bne,pt	%icc, 1b
+	 add	%o3, (16 * 4), %o3
+
+	std	%f2, [%o3 - 0x10]
+	std	%f0, [%o3 - 0x08]
+
+	retl
+	 VISExit
+ENDPROC(camellia_sparc64_key_expand)
+
+	.align	32
+ENTRY(camellia_sparc64_crypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=key_len */
+	VISEntry
+
+	ld	[%o1 + 0x00], %f0
+	ld	[%o1 + 0x04], %f1
+	ld	[%o1 + 0x08], %f2
+	ld	[%o1 + 0x0c], %f3
+
+	ldd	[%o0 + 0x00], %f4
+	ldd	[%o0 + 0x08], %f6
+
+	cmp	%o3, 16
+	fxor	%f4, %f0, %f0
+	be	1f
+	 fxor	%f6, %f2, %f2
+
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	add	%o0, 0x40, %o0
+
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+
+1:
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	ldd	[%o0 + 0x50], %f24
+	ldd	[%o0 + 0x58], %f26
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f32
+	ldd	[%o0 + 0x78], %f34
+	ldd	[%o0 + 0x80], %f36
+	ldd	[%o0 + 0x88], %f38
+	ldd	[%o0 + 0x90], %f40
+	ldd	[%o0 + 0x98], %f42
+	ldd	[%o0 + 0xa0], %f44
+	ldd	[%o0 + 0xa8], %f46
+	ldd	[%o0 + 0xb0], %f48
+	ldd	[%o0 + 0xb8], %f50
+	ldd	[%o0 + 0xc0], %f52
+	ldd	[%o0 + 0xc8], %f54
+
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f2
+	fxor	%f54, %f0, %f0
+
+	st	%f2, [%o2 + 0x00]
+	st	%f3, [%o2 + 0x04]
+	st	%f0, [%o2 + 0x08]
+	st	%f1, [%o2 + 0x0c]
+
+	retl
+	 VISExit
+ENDPROC(camellia_sparc64_crypt)
+
+	.align	32
+ENTRY(camellia_sparc64_load_keys)
+	/* %o0=key, %o1=key_len */
+	VISEntry
+	ldd	[%o0 + 0x00], %f4
+	ldd	[%o0 + 0x08], %f6
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	ldd	[%o0 + 0x50], %f24
+	ldd	[%o0 + 0x58], %f26
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f32
+	ldd	[%o0 + 0x78], %f34
+	ldd	[%o0 + 0x80], %f36
+	ldd	[%o0 + 0x88], %f38
+	ldd	[%o0 + 0x90], %f40
+	ldd	[%o0 + 0x98], %f42
+	ldd	[%o0 + 0xa0], %f44
+	ldd	[%o0 + 0xa8], %f46
+	ldd	[%o0 + 0xb0], %f48
+	ldd	[%o0 + 0xb8], %f50
+	ldd	[%o0 + 0xc0], %f52
+	retl
+	 ldd	[%o0 + 0xc8], %f54
+ENDPROC(camellia_sparc64_load_keys)
+
+	.align	32
+ENTRY(camellia_sparc64_ecb_crypt_3_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key */
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f2
+	fxor	%f54, %f0, %f0
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	retl
+	 nop
+ENDPROC(camellia_sparc64_ecb_crypt_3_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_ecb_crypt_4_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key */
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	ldd	[%o3 + 0xd0], %f8
+	ldd	[%o3 + 0xd8], %f10
+	ldd	[%o3 + 0xe0], %f12
+	ldd	[%o3 + 0xe8], %f14
+	ldd	[%o3 + 0xf0], %f16
+	ldd	[%o3 + 0xf8], %f18
+	ldd	[%o3 + 0x100], %f20
+	ldd	[%o3 + 0x108], %f22
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2)
+	CAMELLIA_F(8, 2, 0, 2)
+	CAMELLIA_F(10, 0, 2, 0)
+	ldd	[%o3 + 0x10], %f8
+	ldd	[%o3 + 0x18], %f10
+	CAMELLIA_F(12, 2, 0, 2)
+	CAMELLIA_F(14, 0, 2, 0)
+	ldd	[%o3 + 0x20], %f12
+	ldd	[%o3 + 0x28], %f14
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	ldd	[%o3 + 0x30], %f16
+	ldd	[%o3 + 0x38], %f18
+	fxor	%f20, %f2, %f2
+	fxor	%f22, %f0, %f0
+	ldd	[%o3 + 0x40], %f20
+	ldd	[%o3 + 0x48], %f22
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	retl
+	 nop
+ENDPROC(camellia_sparc64_ecb_crypt_4_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_encrypt_3_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f60, %f0, %f0
+	fxor	%f62, %f2, %f2
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f60
+	fxor	%f54, %f0, %f62
+	std	%f60, [%o1 + 0x00]
+	std	%f62, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_encrypt_3_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_encrypt_4_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f60, %f0, %f0
+	fxor	%f62, %f2, %f2
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	ldd	[%o3 + 0xd0], %f8
+	ldd	[%o3 + 0xd8], %f10
+	ldd	[%o3 + 0xe0], %f12
+	ldd	[%o3 + 0xe8], %f14
+	ldd	[%o3 + 0xf0], %f16
+	ldd	[%o3 + 0xf8], %f18
+	ldd	[%o3 + 0x100], %f20
+	ldd	[%o3 + 0x108], %f22
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2)
+	CAMELLIA_F(8, 2, 0, 2)
+	CAMELLIA_F(10, 0, 2, 0)
+	ldd	[%o3 + 0x10], %f8
+	ldd	[%o3 + 0x18], %f10
+	CAMELLIA_F(12, 2, 0, 2)
+	CAMELLIA_F(14, 0, 2, 0)
+	ldd	[%o3 + 0x20], %f12
+	ldd	[%o3 + 0x28], %f14
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	ldd	[%o3 + 0x30], %f16
+	ldd	[%o3 + 0x38], %f18
+	fxor	%f20, %f2, %f60
+	fxor	%f22, %f0, %f62
+	ldd	[%o3 + 0x40], %f20
+	ldd	[%o3 + 0x48], %f22
+	std	%f60, [%o1 + 0x00]
+	std	%f62, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_encrypt_4_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_decrypt_3_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f56
+	ldd	[%o0 + 0x08], %f58
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f56, %f0
+	fxor	%f6, %f58, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f2
+	fxor	%f54, %f0, %f0
+	fxor	%f60, %f2, %f2
+	fxor	%f62, %f0, %f0
+	fsrc2	%f56, %f60
+	fsrc2	%f58, %f62
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_decrypt_3_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_decrypt_4_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f56
+	ldd	[%o0 + 0x08], %f58
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f56, %f0
+	fxor	%f6, %f58, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	ldd	[%o3 + 0xd0], %f8
+	ldd	[%o3 + 0xd8], %f10
+	ldd	[%o3 + 0xe0], %f12
+	ldd	[%o3 + 0xe8], %f14
+	ldd	[%o3 + 0xf0], %f16
+	ldd	[%o3 + 0xf8], %f18
+	ldd	[%o3 + 0x100], %f20
+	ldd	[%o3 + 0x108], %f22
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2)
+	CAMELLIA_F(8, 2, 0, 2)
+	CAMELLIA_F(10, 0, 2, 0)
+	ldd	[%o3 + 0x10], %f8
+	ldd	[%o3 + 0x18], %f10
+	CAMELLIA_F(12, 2, 0, 2)
+	CAMELLIA_F(14, 0, 2, 0)
+	ldd	[%o3 + 0x20], %f12
+	ldd	[%o3 + 0x28], %f14
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	ldd	[%o3 + 0x30], %f16
+	ldd	[%o3 + 0x38], %f18
+	fxor	%f20, %f2, %f2
+	fxor	%f22, %f0, %f0
+	ldd	[%o3 + 0x40], %f20
+	ldd	[%o3 + 0x48], %f22
+	fxor	%f60, %f2, %f2
+	fxor	%f62, %f0, %f0
+	fsrc2	%f56, %f60
+	fsrc2	%f58, %f62
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_decrypt_4_grand_rounds)
--- a/arch/sparc/crypto/camellia_glue.c
+++ b/arch/sparc/crypto/camellia_glue.c
+/* Glue code for CAMELLIA encryption optimized for sparc64 crypto opcodes.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+#include <asm/fpumacro.h>
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+#define CAMELLIA_MIN_KEY_SIZE        16
+#define CAMELLIA_MAX_KEY_SIZE        32
+#define CAMELLIA_BLOCK_SIZE          16
+#define CAMELLIA_TABLE_BYTE_LEN     272
+
+struct camellia_sparc64_ctx {
+	u64 encrypt_key[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+	u64 decrypt_key[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+	int key_len;
+};
+
+extern void camellia_sparc64_key_expand(const u32 *in_key, u64 *encrypt_key,
+					unsigned int key_len, u64 *decrypt_key);
+
+static int camellia_set_key(struct crypto_tfm *tfm, const u8 *_in_key,
+			    unsigned int key_len)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *in_key = (const u32 *) _in_key;
+	u32 *flags = &tfm->crt_flags;
+
+	if (key_len != 16 && key_len != 24 && key_len != 32) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	ctx->key_len = key_len;
+
+	camellia_sparc64_key_expand(in_key, &ctx->encrypt_key[0],
+				    key_len, &ctx->decrypt_key[0]);
+	return 0;
+}
+
+extern void camellia_sparc64_crypt(const u64 *key, const u32 *input,
+				   u32 *output, unsigned int key_len);
+
+static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	camellia_sparc64_crypt(&ctx->encrypt_key[0],
+			       (const u32 *) src,
+			       (u32 *) dst, ctx->key_len);
+}
+
+static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	camellia_sparc64_crypt(&ctx->decrypt_key[0],
+			       (const u32 *) src,
+			       (u32 *) dst, ctx->key_len);
+}
+
+extern void camellia_sparc64_load_keys(const u64 *key, unsigned int key_len);
+
+typedef void ecb_crypt_op(const u64 *input, u64 *output, unsigned int len,
+			  const u64 *key);
+
+extern ecb_crypt_op camellia_sparc64_ecb_crypt_3_grand_rounds;
+extern ecb_crypt_op camellia_sparc64_ecb_crypt_4_grand_rounds;
+
+#define CAMELLIA_BLOCK_MASK	(~(CAMELLIA_BLOCK_SIZE - 1))
+
+static int __ecb_crypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes, bool encrypt)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	ecb_crypt_op *op;
+	const u64 *key;
+	int err;
+
+	op = camellia_sparc64_ecb_crypt_3_grand_rounds;
+	if (ctx->key_len != 16)
+		op = camellia_sparc64_ecb_crypt_4_grand_rounds;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	if (encrypt)
+		key = &ctx->encrypt_key[0];
+	else
+		key = &ctx->decrypt_key[0];
+	camellia_sparc64_load_keys(key, ctx->key_len);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64;
+			u64 *dst64;
+
+			src64 = (const u64 *)walk.src.virt.addr;
+			dst64 = (u64 *) walk.dst.virt.addr;
+			op(src64, dst64, block_len, key);
+		}
+		nbytes &= CAMELLIA_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, false);
+}
+
+typedef void cbc_crypt_op(const u64 *input, u64 *output, unsigned int len,
+			  const u64 *key, u64 *iv);
+
+extern cbc_crypt_op camellia_sparc64_cbc_encrypt_3_grand_rounds;
+extern cbc_crypt_op camellia_sparc64_cbc_encrypt_4_grand_rounds;
+extern cbc_crypt_op camellia_sparc64_cbc_decrypt_3_grand_rounds;
+extern cbc_crypt_op camellia_sparc64_cbc_decrypt_4_grand_rounds;
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	cbc_crypt_op *op;
+	const u64 *key;
+	int err;
+
+	op = camellia_sparc64_cbc_encrypt_3_grand_rounds;
+	if (ctx->key_len != 16)
+		op = camellia_sparc64_cbc_encrypt_4_grand_rounds;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	key = &ctx->encrypt_key[0];
+	camellia_sparc64_load_keys(key, ctx->key_len);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64;
+			u64 *dst64;
+
+			src64 = (const u64 *)walk.src.virt.addr;
+			dst64 = (u64 *) walk.dst.virt.addr;
+			op(src64, dst64, block_len, key,
+			   (u64 *) walk.iv);
+		}
+		nbytes &= CAMELLIA_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	cbc_crypt_op *op;
+	const u64 *key;
+	int err;
+
+	op = camellia_sparc64_cbc_decrypt_3_grand_rounds;
+	if (ctx->key_len != 16)
+		op = camellia_sparc64_cbc_decrypt_4_grand_rounds;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	key = &ctx->decrypt_key[0];
+	camellia_sparc64_load_keys(key, ctx->key_len);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64;
+			u64 *dst64;
+
+			src64 = (const u64 *)walk.src.virt.addr;
+			dst64 = (u64 *) walk.dst.virt.addr;
+			op(src64, dst64, block_len, key,
+			   (u64 *) walk.iv);
+		}
+		nbytes &= CAMELLIA_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static struct crypto_alg algs[] = { {
+	.cra_name		= "camellia",
+	.cra_driver_name	= "camellia-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_sparc64_ctx),
+	.cra_alignmask		= 3,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.cia_max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.cia_setkey		= camellia_set_key,
+			.cia_encrypt		= camellia_encrypt,
+			.cia_decrypt		= camellia_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}
+};
+
+static bool __init sparc64_has_camellia_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_CAMELLIA))
+		return false;
+
+	return true;
+}
+
+static int __init camellia_sparc64_mod_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		INIT_LIST_HEAD(&algs[i].cra_list);
+
+	if (sparc64_has_camellia_opcode()) {
+		pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n");
+		return crypto_register_algs(algs, ARRAY_SIZE(algs));
+	}
+	pr_info("sparc64 camellia opcodes not available.\n");
+	return -ENODEV;
+}
+
+static void __exit camellia_sparc64_mod_fini(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(camellia_sparc64_mod_init);
+module_exit(camellia_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, sparc64 camellia opcode accelerated");
+
+MODULE_ALIAS("aes");
--- a/arch/sparc/crypto/crc32c_asm.S
+++ b/arch/sparc/crypto/crc32c_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+#include <asm/asi.h>
+
+#include "opcodes.h"
+
+ENTRY(crc32c_sparc64)
+	/* %o0=crc32p, %o1=data_ptr, %o2=len */
+	VISEntryHalf
+	lda	[%o0] ASI_PL, %f1
+1:	ldd	[%o1], %f2
+	CRC32C(0,2,0)
+	subcc	%o2, 8, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x8, %o1
+	sta	%f1, [%o0] ASI_PL
+	VISExitHalf
+2:	retl
+	 nop
+ENDPROC(crc32c_sparc64)
--- a/arch/sparc/crypto/crc32c_glue.c
+++ b/arch/sparc/crypto/crc32c_glue.c
+/* Glue code for CRC32C optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/crc32c-intel.c
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ *          Kent Liu <kent.liu@intel.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_sparc64_setkey(struct crypto_shash *hash, const u8 *key,
+				 unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*(__le32 *)mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32c_sparc64_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*crcp = *mctx;
+
+	return 0;
+}
+
+extern void crc32c_sparc64(u32 *crcp, const u64 *data, unsigned int len);
+
+static void crc32c_compute(u32 *crcp, const u64 *data, unsigned int len)
+{
+	unsigned int asm_len;
+
+	asm_len = len & ~7U;
+	if (asm_len) {
+		crc32c_sparc64(crcp, data, asm_len);
+		data += asm_len / 8;
+		len -= asm_len;
+	}
+	if (len)
+		*crcp = __crc32c_le(*crcp, (const unsigned char *) data, len);
+}
+
+static int crc32c_sparc64_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	crc32c_compute(crcp, (const u64 *) data, len);
+
+	return 0;
+}
+
+static int __crc32c_sparc64_finup(u32 *crcp, const u8 *data, unsigned int len,
+				  u8 *out)
+{
+	u32 tmp = *crcp;
+
+	crc32c_compute(&tmp, (const u64 *) data, len);
+
+	*(__le32 *) out = ~cpu_to_le32(tmp);
+	return 0;
+}
+
+static int crc32c_sparc64_finup(struct shash_desc *desc, const u8 *data,
+				unsigned int len, u8 *out)
+{
+	return __crc32c_sparc64_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*(__le32 *) out = ~cpu_to_le32p(crcp);
+	return 0;
+}
+
+static int crc32c_sparc64_digest(struct shash_desc *desc, const u8 *data,
+				 unsigned int len, u8 *out)
+{
+	return __crc32c_sparc64_finup(crypto_shash_ctx(desc->tfm), data, len,
+				      out);
+}
+
+static int crc32c_sparc64_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+
+	return 0;
+}
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+static struct shash_alg alg = {
+	.setkey			=	crc32c_sparc64_setkey,
+	.init			=	crc32c_sparc64_init,
+	.update			=	crc32c_sparc64_update,
+	.final			=	crc32c_sparc64_final,
+	.finup			=	crc32c_sparc64_finup,
+	.digest			=	crc32c_sparc64_digest,
+	.descsize		=	sizeof(u32),
+	.digestsize		=	CHKSUM_DIGEST_SIZE,
+	.base			=	{
+		.cra_name		=	"crc32c",
+		.cra_driver_name	=	"crc32c-sparc64",
+		.cra_priority		=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		=	sizeof(u32),
+		.cra_alignmask		=	7,
+		.cra_module		=	THIS_MODULE,
+		.cra_init		=	crc32c_sparc64_cra_init,
+	}
+};
+
+static bool __init sparc64_has_crc32c_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_CRC32C))
+		return false;
+
+	return true;
+}
+
+static int __init crc32c_sparc64_mod_init(void)
+{
+	if (sparc64_has_crc32c_opcode()) {
+		pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("sparc64 crc32c opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit crc32c_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crc32c_sparc64_mod_init);
+module_exit(crc32c_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CRC32c (Castagnoli), sparc64 crc32c opcode accelerated");
+
+MODULE_ALIAS("crc32c");
--- a/arch/sparc/crypto/crop_devid.c
+++ b/arch/sparc/crypto/crop_devid.c
+#include <linux/module.h>
+#include <linux/of_device.h>
+
+/* This is a dummy device table linked into all of the crypto
+ * opcode drivers.  It serves to trigger the module autoloading
+ * mechanisms in userspace which scan the OF device tree and
+ * load any modules which have device table entries that
+ * match OF device nodes.
+ */
+static const struct of_device_id crypto_opcode_match[] = {
+	{ .name = "cpu", .compatible = "sun4v", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, crypto_opcode_match);
--- a/arch/sparc/crypto/des_asm.S
+++ b/arch/sparc/crypto/des_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+	.align	32
+ENTRY(des_sparc64_key_expand)
+	/* %o0=input_key, %o1=output_key */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	DES_KEXPAND(0, 0, 0)
+	DES_KEXPAND(0, 1, 2)
+	DES_KEXPAND(2, 3, 6)
+	DES_KEXPAND(2, 2, 4)
+	DES_KEXPAND(6, 3, 10)
+	DES_KEXPAND(6, 2, 8)
+	DES_KEXPAND(10, 3, 14)
+	DES_KEXPAND(10, 2, 12)
+	DES_KEXPAND(14, 1, 16)
+	DES_KEXPAND(16, 3, 20)
+	DES_KEXPAND(16, 2, 18)
+	DES_KEXPAND(20, 3, 24)
+	DES_KEXPAND(20, 2, 22)
+	DES_KEXPAND(24, 3, 28)
+	DES_KEXPAND(24, 2, 26)
+	DES_KEXPAND(28, 1, 30)
+	std	%f0, [%o1 + 0x00]
+	std	%f2, [%o1 + 0x08]
+	std	%f4, [%o1 + 0x10]
+	std	%f6, [%o1 + 0x18]
+	std	%f8, [%o1 + 0x20]
+	std	%f10, [%o1 + 0x28]
+	std	%f12, [%o1 + 0x30]
+	std	%f14, [%o1 + 0x38]
+	std	%f16, [%o1 + 0x40]
+	std	%f18, [%o1 + 0x48]
+	std	%f20, [%o1 + 0x50]
+	std	%f22, [%o1 + 0x58]
+	std	%f24, [%o1 + 0x60]
+	std	%f26, [%o1 + 0x68]
+	std	%f28, [%o1 + 0x70]
+	std	%f30, [%o1 + 0x78]
+	retl
+	 VISExitHalf
+ENDPROC(des_sparc64_key_expand)
+
+	.align	32
+ENTRY(des_sparc64_crypt)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ldd	[%o1 + 0x00], %f32
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	ldd	[%o0 + 0x78], %f30
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	std	%f32, [%o2 + 0x00]
+	retl
+	 VISExit
+ENDPROC(des_sparc64_crypt)
+
+	.align	32
+ENTRY(des_sparc64_load_keys)
+	/* %o0=key */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	retl
+	 ldd	[%o0 + 0x78], %f30
+ENDPROC(des_sparc64_load_keys)
+
+	.align	32
+ENTRY(des_sparc64_ecb_crypt)
+	/* %o0=input, %o1=output, %o2=len */
+1:	ldd	[%o0 + 0x00], %f32
+	add	%o0, 0x08, %o0
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	std	%f32, [%o1 + 0x00]
+	subcc	%o2, 0x08, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x08, %o1
+	retl
+	 nop
+ENDPROC(des_sparc64_ecb_crypt)
+
+	.align	32
+ENTRY(des_sparc64_cbc_encrypt)
+	/* %o0=input, %o1=output, %o2=len, %o3=IV */
+	ldd	[%o3 + 0x00], %f32
+1:	ldd	[%o0 + 0x00], %f34
+	fxor	%f32, %f34, %f32
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	std	%f32, [%o1 + 0x00]
+	add	%o0, 0x08, %o0
+	subcc	%o2, 0x08, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x08, %o1
+	retl
+	 std	%f32, [%o3 + 0x00]
+ENDPROC(des_sparc64_cbc_encrypt)
+
+	.align	32
+ENTRY(des_sparc64_cbc_decrypt)
+	/* %o0=input, %o1=output, %o2=len, %o3=IV */
+	ldd	[%o3 + 0x00], %f34
+1:	ldd	[%o0 + 0x00], %f36
+	DES_IP(36, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	fxor	%f32, %f34, %f32
+	fsrc2	%f36, %f34
+	std	%f32, [%o1 + 0x00]
+	add	%o0, 0x08, %o0
+	subcc	%o2, 0x08, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x08, %o1
+	retl
+	 std	%f36, [%o3 + 0x00]
+ENDPROC(des_sparc64_cbc_decrypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_crypt)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ldd	[%o1 + 0x00], %f32
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	ldd	[%o0 + 0x78], %f30
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	ldd	[%o0 + 0x80], %f0
+	ldd	[%o0 + 0x88], %f2
+	DES_ROUND(4, 6, 32, 32)
+	ldd	[%o0 + 0x90], %f4
+	ldd	[%o0 + 0x98], %f6
+	DES_ROUND(8, 10, 32, 32)
+	ldd	[%o0 + 0xa0], %f8
+	ldd	[%o0 + 0xa8], %f10
+	DES_ROUND(12, 14, 32, 32)
+	ldd	[%o0 + 0xb0], %f12
+	ldd	[%o0 + 0xb8], %f14
+	DES_ROUND(16, 18, 32, 32)
+	ldd	[%o0 + 0xc0], %f16
+	ldd	[%o0 + 0xc8], %f18
+	DES_ROUND(20, 22, 32, 32)
+	ldd	[%o0 + 0xd0], %f20
+	ldd	[%o0 + 0xd8], %f22
+	DES_ROUND(24, 26, 32, 32)
+	ldd	[%o0 + 0xe0], %f24
+	ldd	[%o0 + 0xe8], %f26
+	DES_ROUND(28, 30, 32, 32)
+	ldd	[%o0 + 0xf0], %f28
+	ldd	[%o0 + 0xf8], %f30
+	DES_IIP(32, 32)
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	ldd	[%o0 + 0x100], %f0
+	ldd	[%o0 + 0x108], %f2
+	DES_ROUND(4, 6, 32, 32)
+	ldd	[%o0 + 0x110], %f4
+	ldd	[%o0 + 0x118], %f6
+	DES_ROUND(8, 10, 32, 32)
+	ldd	[%o0 + 0x120], %f8
+	ldd	[%o0 + 0x128], %f10
+	DES_ROUND(12, 14, 32, 32)
+	ldd	[%o0 + 0x130], %f12
+	ldd	[%o0 + 0x138], %f14
+	DES_ROUND(16, 18, 32, 32)
+	ldd	[%o0 + 0x140], %f16
+	ldd	[%o0 + 0x148], %f18
+	DES_ROUND(20, 22, 32, 32)
+	ldd	[%o0 + 0x150], %f20
+	ldd	[%o0 + 0x158], %f22
+	DES_ROUND(24, 26, 32, 32)
+	ldd	[%o0 + 0x160], %f24
+	ldd	[%o0 + 0x168], %f26
+	DES_ROUND(28, 30, 32, 32)
+	ldd	[%o0 + 0x170], %f28
+	ldd	[%o0 + 0x178], %f30
+	DES_IIP(32, 32)
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+
+	std	%f32, [%o2 + 0x00]
+	retl
+	 VISExit
+ENDPROC(des3_ede_sparc64_crypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_load_keys)
+	/* %o0=key */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	ldd	[%o0 + 0x78], %f30
+	ldd	[%o0 + 0x80], %f32
+	ldd	[%o0 + 0x88], %f34
+	ldd	[%o0 + 0x90], %f36
+	ldd	[%o0 + 0x98], %f38
+	ldd	[%o0 + 0xa0], %f40
+	ldd	[%o0 + 0xa8], %f42
+	ldd	[%o0 + 0xb0], %f44
+	ldd	[%o0 + 0xb8], %f46
+	ldd	[%o0 + 0xc0], %f48
+	ldd	[%o0 + 0xc8], %f50
+	ldd	[%o0 + 0xd0], %f52
+	ldd	[%o0 + 0xd8], %f54
+	ldd	[%o0 + 0xe0], %f56
+	retl
+	 ldd	[%o0 + 0xe8], %f58
+ENDPROC(des3_ede_sparc64_load_keys)
+
+#define DES3_LOOP_BODY(X) \
+	DES_IP(X, X) \
+	DES_ROUND(0, 2, X, X) \
+	DES_ROUND(4, 6, X, X) \
+	DES_ROUND(8, 10, X, X) \
+	DES_ROUND(12, 14, X, X) \
+	DES_ROUND(16, 18, X, X) \
+	ldd	[%o0 + 0xf0], %f16; \
+	ldd	[%o0 + 0xf8], %f18; \
+	DES_ROUND(20, 22, X, X) \
+	ldd	[%o0 + 0x100], %f20; \
+	ldd	[%o0 + 0x108], %f22; \
+	DES_ROUND(24, 26, X, X) \
+	ldd	[%o0 + 0x110], %f24; \
+	ldd	[%o0 + 0x118], %f26; \
+	DES_ROUND(28, 30, X, X) \
+	ldd	[%o0 + 0x120], %f28; \
+	ldd	[%o0 + 0x128], %f30; \
+	DES_IIP(X, X) \
+	DES_IP(X, X) \
+	DES_ROUND(32, 34, X, X) \
+	ldd	[%o0 + 0x130], %f0; \
+	ldd	[%o0 + 0x138], %f2; \
+	DES_ROUND(36, 38, X, X) \
+	ldd	[%o0 + 0x140], %f4; \
+	ldd	[%o0 + 0x148], %f6; \
+	DES_ROUND(40, 42, X, X) \
+	ldd	[%o0 + 0x150], %f8; \
+	ldd	[%o0 + 0x158], %f10; \
+	DES_ROUND(44, 46, X, X) \
+	ldd	[%o0 + 0x160], %f12; \
+	ldd	[%o0 + 0x168], %f14; \
+	DES_ROUND(48, 50, X, X) \
+	DES_ROUND(52, 54, X, X) \
+	DES_ROUND(56, 58, X, X) \
+	DES_ROUND(16, 18, X, X) \
+	ldd	[%o0 + 0x170], %f16; \
+	ldd	[%o0 + 0x178], %f18; \
+	DES_IIP(X, X) \
+	DES_IP(X, X) \
+	DES_ROUND(20, 22, X, X) \
+	ldd	[%o0 + 0x50], %f20; \
+	ldd	[%o0 + 0x58], %f22; \
+	DES_ROUND(24, 26, X, X) \
+	ldd	[%o0 + 0x60], %f24; \
+	ldd	[%o0 + 0x68], %f26; \
+	DES_ROUND(28, 30, X, X) \
+	ldd	[%o0 + 0x70], %f28; \
+	ldd	[%o0 + 0x78], %f30; \
+	DES_ROUND(0, 2, X, X) \
+	ldd	[%o0 + 0x00], %f0; \
+	ldd	[%o0 + 0x08], %f2; \
+	DES_ROUND(4, 6, X, X) \
+	ldd	[%o0 + 0x10], %f4; \
+	ldd	[%o0 + 0x18], %f6; \
+	DES_ROUND(8, 10, X, X) \
+	ldd	[%o0 + 0x20], %f8; \
+	ldd	[%o0 + 0x28], %f10; \
+	DES_ROUND(12, 14, X, X) \
+	ldd	[%o0 + 0x30], %f12; \
+	ldd	[%o0 + 0x38], %f14; \
+	DES_ROUND(16, 18, X, X) \
+	ldd	[%o0 + 0x40], %f16; \
+	ldd	[%o0 + 0x48], %f18; \
+	DES_IIP(X, X)
+
+	.align	32
+ENTRY(des3_ede_sparc64_ecb_crypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+1:	ldd	[%o1 + 0x00], %f60
+	DES3_LOOP_BODY(60)
+	std	%f60, [%o2 + 0x00]
+	subcc	%o3, 0x08, %o3
+	bne,pt	%icc, 1b
+	 add	%o2, 0x08, %o2
+	retl
+	 nop
+ENDPROC(des3_ede_sparc64_ecb_crypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_cbc_encrypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+1:	ldd	[%o1 + 0x00], %f62
+	fxor	%f60, %f62, %f60
+	DES3_LOOP_BODY(60)
+	std	%f60, [%o2 + 0x00]
+	add	%o1, 0x08, %o1
+	subcc	%o3, 0x08, %o3
+	bne,pt	%icc, 1b
+	 add	%o2, 0x08, %o2
+	retl
+	 std	%f60, [%o4 + 0x00]
+ENDPROC(des3_ede_sparc64_cbc_encrypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_cbc_decrypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd	[%o4 + 0x00], %f62
+1:	ldx	[%o1 + 0x00], %g1
+	MOVXTOD_G1_F60
+	DES3_LOOP_BODY(60)
+	fxor	%f62, %f60, %f60
+	MOVXTOD_G1_F62
+	std	%f60, [%o2 + 0x00]
+	add	%o1, 0x08, %o1
+	subcc	%o3, 0x08, %o3
+	bne,pt	%icc, 1b
+	 add	%o2, 0x08, %o2
+	retl
+	 stx	%g1, [%o4 + 0x00]
+ENDPROC(des3_ede_sparc64_cbc_decrypt)
--- a/arch/sparc/crypto/des_glue.c
+++ b/arch/sparc/crypto/des_glue.c
+/* Glue code for DES encryption optimized for sparc64 crypto opcodes.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <crypto/des.h>
+
+#include <asm/fpumacro.h>
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+struct des_sparc64_ctx {
+	u64 encrypt_expkey[DES_EXPKEY_WORDS / 2];
+	u64 decrypt_expkey[DES_EXPKEY_WORDS / 2];
+};
+
+struct des3_ede_sparc64_ctx {
+	u64 encrypt_expkey[DES3_EDE_EXPKEY_WORDS / 2];
+	u64 decrypt_expkey[DES3_EDE_EXPKEY_WORDS / 2];
+};
+
+static void encrypt_to_decrypt(u64 *d, const u64 *e)
+{
+	const u64 *s = e + (DES_EXPKEY_WORDS / 2) - 1;
+	int i;
+
+	for (i = 0; i < DES_EXPKEY_WORDS / 2; i++)
+		*d++ = *s--;
+}
+
+extern void des_sparc64_key_expand(const u32 *input_key, u64 *key);
+
+static int des_set_key(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
+{
+	struct des_sparc64_ctx *dctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	u32 tmp[DES_EXPKEY_WORDS];
+	int ret;
+
+	/* Even though we have special instructions for key expansion,
+	 * we call des_ekey() so that we don't have to write our own
+	 * weak key detection code.
+	 */
+	ret = des_ekey(tmp, key);
+	if (unlikely(ret == 0) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) {
+		*flags |= CRYPTO_TFM_RES_WEAK_KEY;
+		return -EINVAL;
+	}
+
+	des_sparc64_key_expand((const u32 *) key, &dctx->encrypt_expkey[0]);
+	encrypt_to_decrypt(&dctx->decrypt_expkey[0], &dctx->encrypt_expkey[0]);
+
+	return 0;
+}
+
+extern void des_sparc64_crypt(const u64 *key, const u64 *input,
+			      u64 *output);
+
+static void des_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->encrypt_expkey;
+
+	des_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+static void des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->decrypt_expkey;
+
+	des_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+extern void des_sparc64_load_keys(const u64 *key);
+
+extern void des_sparc64_ecb_crypt(const u64 *input, u64 *output,
+				  unsigned int len);
+
+#define DES_BLOCK_MASK	(~(DES_BLOCK_SIZE - 1))
+
+static int __ecb_crypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes, bool encrypt)
+{
+	struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	if (encrypt)
+		des_sparc64_load_keys(&ctx->encrypt_expkey[0]);
+	else
+		des_sparc64_load_keys(&ctx->decrypt_expkey[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			des_sparc64_ecb_crypt((const u64 *)walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, false);
+}
+
+extern void des_sparc64_cbc_encrypt(const u64 *input, u64 *output,
+				    unsigned int len, u64 *iv);
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	des_sparc64_load_keys(&ctx->encrypt_expkey[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			des_sparc64_cbc_encrypt((const u64 *)walk.src.virt.addr,
+						(u64 *) walk.dst.virt.addr,
+						block_len, (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+extern void des_sparc64_cbc_decrypt(const u64 *input, u64 *output,
+				    unsigned int len, u64 *iv);
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	des_sparc64_load_keys(&ctx->decrypt_expkey[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			des_sparc64_cbc_decrypt((const u64 *)walk.src.virt.addr,
+						(u64 *) walk.dst.virt.addr,
+						block_len, (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key,
+			    unsigned int keylen)
+{
+	struct des3_ede_sparc64_ctx *dctx = crypto_tfm_ctx(tfm);
+	const u32 *K = (const u32 *)key;
+	u32 *flags = &tfm->crt_flags;
+	u64 k1[DES_EXPKEY_WORDS / 2];
+	u64 k2[DES_EXPKEY_WORDS / 2];
+	u64 k3[DES_EXPKEY_WORDS / 2];
+
+	if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
+		     !((K[2] ^ K[4]) | (K[3] ^ K[5]))) &&
+		     (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) {
+		*flags |= CRYPTO_TFM_RES_WEAK_KEY;
+		return -EINVAL;
+	}
+
+	des_sparc64_key_expand((const u32 *)key, k1);
+	key += DES_KEY_SIZE;
+	des_sparc64_key_expand((const u32 *)key, k2);
+	key += DES_KEY_SIZE;
+	des_sparc64_key_expand((const u32 *)key, k3);
+
+	memcpy(&dctx->encrypt_expkey[0], &k1[0], sizeof(k1));
+	encrypt_to_decrypt(&dctx->encrypt_expkey[DES_EXPKEY_WORDS / 2], &k2[0]);
+	memcpy(&dctx->encrypt_expkey[(DES_EXPKEY_WORDS / 2) * 2],
+	       &k3[0], sizeof(k3));
+
+	encrypt_to_decrypt(&dctx->decrypt_expkey[0], &k3[0]);
+	memcpy(&dctx->decrypt_expkey[DES_EXPKEY_WORDS / 2],
+	       &k2[0], sizeof(k2));
+	encrypt_to_decrypt(&dctx->decrypt_expkey[(DES_EXPKEY_WORDS / 2) * 2],
+			   &k1[0]);
+
+	return 0;
+}
+
+extern void des3_ede_sparc64_crypt(const u64 *key, const u64 *input,
+				   u64 *output);
+
+static void des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->encrypt_expkey;
+
+	des3_ede_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+static void des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->decrypt_expkey;
+
+	des3_ede_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+extern void des3_ede_sparc64_load_keys(const u64 *key);
+
+extern void des3_ede_sparc64_ecb_crypt(const u64 *expkey, const u64 *input,
+				       u64 *output, unsigned int len);
+
+static int __ecb3_crypt(struct blkcipher_desc *desc,
+			struct scatterlist *dst, struct scatterlist *src,
+			unsigned int nbytes, bool encrypt)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	const u64 *K;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	if (encrypt)
+		K = &ctx->encrypt_expkey[0];
+	else
+		K = &ctx->decrypt_expkey[0];
+	des3_ede_sparc64_load_keys(K);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64 = (const u64 *)walk.src.virt.addr;
+			des3_ede_sparc64_ecb_crypt(K, src64,
+						   (u64 *) walk.dst.virt.addr,
+						   block_len);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb3_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb3_crypt(desc, dst, src, nbytes, true);
+}
+
+static int ecb3_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb3_crypt(desc, dst, src, nbytes, false);
+}
+
+extern void des3_ede_sparc64_cbc_encrypt(const u64 *expkey, const u64 *input,
+					 u64 *output, unsigned int len,
+					 u64 *iv);
+
+static int cbc3_encrypt(struct blkcipher_desc *desc,
+			struct scatterlist *dst, struct scatterlist *src,
+			unsigned int nbytes)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	const u64 *K;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	K = &ctx->encrypt_expkey[0];
+	des3_ede_sparc64_load_keys(K);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64 = (const u64 *)walk.src.virt.addr;
+			des3_ede_sparc64_cbc_encrypt(K, src64,
+						     (u64 *) walk.dst.virt.addr,
+						     block_len,
+						     (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+extern void des3_ede_sparc64_cbc_decrypt(const u64 *expkey, const u64 *input,
+					 u64 *output, unsigned int len,
+					 u64 *iv);
+
+static int cbc3_decrypt(struct blkcipher_desc *desc,
+			struct scatterlist *dst, struct scatterlist *src,
+			unsigned int nbytes)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	const u64 *K;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	K = &ctx->decrypt_expkey[0];
+	des3_ede_sparc64_load_keys(K);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64 = (const u64 *)walk.src.virt.addr;
+			des3_ede_sparc64_cbc_decrypt(K, src64,
+						     (u64 *) walk.dst.virt.addr,
+						     block_len,
+						     (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static struct crypto_alg algs[] = { {
+	.cra_name		= "des",
+	.cra_driver_name	= "des-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= DES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= DES_KEY_SIZE,
+			.cia_max_keysize	= DES_KEY_SIZE,
+			.cia_setkey		= des_set_key,
+			.cia_encrypt		= des_encrypt,
+			.cia_decrypt		= des_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(des)",
+	.cra_driver_name	= "ecb-des-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES_KEY_SIZE,
+			.max_keysize	= DES_KEY_SIZE,
+			.setkey		= des_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(des)",
+	.cra_driver_name	= "cbc-des-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES_KEY_SIZE,
+			.max_keysize	= DES_KEY_SIZE,
+			.setkey		= des_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "des3_ede",
+	.cra_driver_name	= "des3_ede-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= DES3_EDE_KEY_SIZE,
+			.cia_max_keysize	= DES3_EDE_KEY_SIZE,
+			.cia_setkey		= des3_ede_set_key,
+			.cia_encrypt		= des3_ede_encrypt,
+			.cia_decrypt		= des3_ede_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(des3_ede)",
+	.cra_driver_name	= "ecb-des3_ede-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.setkey		= des3_ede_set_key,
+			.encrypt	= ecb3_encrypt,
+			.decrypt	= ecb3_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(des3_ede)",
+	.cra_driver_name	= "cbc-des3_ede-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.setkey		= des3_ede_set_key,
+			.encrypt	= cbc3_encrypt,
+			.decrypt	= cbc3_decrypt,
+		},
+	},
+} };
+
+static bool __init sparc64_has_des_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_DES))
+		return false;
+
+	return true;
+}
+
+static int __init des_sparc64_mod_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		INIT_LIST_HEAD(&algs[i].cra_list);
+
+	if (sparc64_has_des_opcode()) {
+		pr_info("Using sparc64 des opcodes optimized DES implementation\n");
+		return crypto_register_algs(algs, ARRAY_SIZE(algs));
+	}
+	pr_info("sparc64 des opcodes not available.\n");
+	return -ENODEV;
+}
+
+static void __exit des_sparc64_mod_fini(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(des_sparc64_mod_init);
+module_exit(des_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms, sparc64 des opcode accelerated");
+
+MODULE_ALIAS("des");
--- a/arch/sparc/crypto/md5_asm.S
+++ b/arch/sparc/crypto/md5_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(md5_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x08], %f2
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x0c], %f3
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	MD5
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	MD5
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(md5_sparc64_transform)
--- a/arch/sparc/crypto/md5_glue.c
+++ b/arch/sparc/crypto/md5_glue.c
+/* Glue code for MD5 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c
+ * and crypto/md5.c which are:
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/md5.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void md5_sparc64_transform(u32 *digest, const char *data,
+				      unsigned int rounds);
+
+static int md5_sparc64_init(struct shash_desc *desc)
+{
+	struct md5_state *mctx = shash_desc_ctx(desc);
+
+	mctx->hash[0] = cpu_to_le32(0x67452301);
+	mctx->hash[1] = cpu_to_le32(0xefcdab89);
+	mctx->hash[2] = cpu_to_le32(0x98badcfe);
+	mctx->hash[3] = cpu_to_le32(0x10325476);
+	mctx->byte_count = 0;
+
+	return 0;
+}
+
+static void __md5_sparc64_update(struct md5_state *sctx, const u8 *data,
+				 unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	sctx->byte_count += len;
+	if (partial) {
+		done = MD5_HMAC_BLOCK_SIZE - partial;
+		memcpy((u8 *)sctx->block + partial, data, done);
+		md5_sparc64_transform(sctx->hash, (u8 *)sctx->block, 1);
+	}
+	if (len - done >= MD5_HMAC_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / MD5_HMAC_BLOCK_SIZE;
+
+		md5_sparc64_transform(sctx->hash, data + done, rounds);
+		done += rounds * MD5_HMAC_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->block, data + done, len - done);
+}
+
+static int md5_sparc64_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->byte_count % MD5_HMAC_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < MD5_HMAC_BLOCK_SIZE) {
+		sctx->byte_count += len;
+		memcpy((u8 *)sctx->block + partial, data, len);
+	} else
+		__md5_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+/* Add padding and return the message digest. */
+static int md5_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	u32 *dst = (u32 *)out;
+	__le64 bits;
+	static const u8 padding[MD5_HMAC_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_le64(sctx->byte_count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->byte_count % MD5_HMAC_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((MD5_HMAC_BLOCK_SIZE+56) - index);
+
+	/* We need to fill a whole block for __md5_sparc64_update() */
+	if (padlen <= 56) {
+		sctx->byte_count += padlen;
+		memcpy((u8 *)sctx->block + index, padding, padlen);
+	} else {
+		__md5_sparc64_update(sctx, padding, padlen, index);
+	}
+	__md5_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < MD5_HASH_WORDS; i++)
+		dst[i] = sctx->hash[i];
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int md5_sparc64_export(struct shash_desc *desc, void *out)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int md5_sparc64_import(struct shash_desc *desc, const void *in)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	MD5_DIGEST_SIZE,
+	.init		=	md5_sparc64_init,
+	.update		=	md5_sparc64_update,
+	.final		=	md5_sparc64_final,
+	.export		=	md5_sparc64_export,
+	.import		=	md5_sparc64_import,
+	.descsize	=	sizeof(struct md5_state),
+	.statesize	=	sizeof(struct md5_state),
+	.base		=	{
+		.cra_name	=	"md5",
+		.cra_driver_name=	"md5-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_md5_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_MD5))
+		return false;
+
+	return true;
+}
+
+static int __init md5_sparc64_mod_init(void)
+{
+	if (sparc64_has_md5_opcode()) {
+		pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("sparc64 md5 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit md5_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(md5_sparc64_mod_init);
+module_exit(md5_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, sparc64 md5 opcode accelerated");
+
+MODULE_ALIAS("md5");
--- a/arch/sparc/crypto/opcodes.h
+++ b/arch/sparc/crypto/opcodes.h
+#ifndef _OPCODES_H
+#define _OPCODES_H
+
+#define SPARC_CR_OPCODE_PRIORITY	300
+
+#define F3F(x,y,z)	(((x)<<30)|((y)<<19)|((z)<<5))
+
+#define FPD_ENCODE(x)	(((x) >> 5) | ((x) & ~(0x20)))
+
+#define RS1(x)		(FPD_ENCODE(x) << 14)
+#define RS2(x)		(FPD_ENCODE(x) <<  0)
+#define RS3(x)		(FPD_ENCODE(x) <<  9)
+#define RD(x)		(FPD_ENCODE(x) << 25)
+#define IMM5_0(x)	((x)           <<  0)
+#define IMM5_9(x)	((x)           <<  9)
+
+#define CRC32C(a,b,c)	\
+	.word		(F3F(2,0x36,0x147)|RS1(a)|RS2(b)|RD(c));
+
+#define MD5		\
+	.word	0x81b02800;
+#define SHA1		\
+	.word	0x81b02820;
+#define SHA256		\
+	.word	0x81b02840;
+#define SHA512		\
+	.word	0x81b02860;
+
+#define AES_EROUND01(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 0)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND23(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 1)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND01(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 2)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND23(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 3)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND01_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 4)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND23_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 5)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND01_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 6)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND23_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 7)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_KEXPAND1(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 8)|RS1(a)|RS2(b)|IMM5_9(c)|RD(d));
+#define AES_KEXPAND0(a,b,c)	\
+	.word	(F3F(2, 0x36, 0x130)|RS1(a)|RS2(b)|RD(c));
+#define AES_KEXPAND2(a,b,c)	\
+	.word	(F3F(2, 0x36, 0x131)|RS1(a)|RS2(b)|RD(c));
+
+#define DES_IP(a,b)		\
+	.word		(F3F(2, 0x36, 0x134)|RS1(a)|RD(b));
+#define DES_IIP(a,b)		\
+	.word		(F3F(2, 0x36, 0x135)|RS1(a)|RD(b));
+#define DES_KEXPAND(a,b,c)	\
+	.word		(F3F(2, 0x36, 0x136)|RS1(a)|IMM5_0(b)|RD(c));
+#define DES_ROUND(a,b,c,d)	\
+	.word		(F3F(2, 0x19, 0x009)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+
+#define CAMELLIA_F(a,b,c,d)		\
+	.word		(F3F(2, 0x19, 0x00c)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define CAMELLIA_FL(a,b,c)		\
+	.word		(F3F(2, 0x36, 0x13c)|RS1(a)|RS2(b)|RD(c));
+#define CAMELLIA_FLI(a,b,c)		\
+	.word		(F3F(2, 0x36, 0x13d)|RS1(a)|RS2(b)|RD(c));
+
+#define MOVDTOX_F0_O4		\
+	.word	0x99b02200
+#define MOVDTOX_F2_O5		\
+	.word	0x9bb02202
+#define MOVXTOD_G1_F60 		\
+	.word	0xbbb02301
+#define MOVXTOD_G1_F62 		\
+	.word	0xbfb02301
+#define MOVXTOD_G3_F4		\
+	.word	0x89b02303;
+#define MOVXTOD_G7_F6		\
+	.word	0x8db02307;
+#define MOVXTOD_G3_F0		\
+	.word	0x81b02303;
+#define MOVXTOD_G7_F2		\
+	.word	0x85b02307;
+#define MOVXTOD_O0_F0		\
+	.word	0x81b02308;
+#define MOVXTOD_O5_F0		\
+	.word	0x81b0230d;
+#define MOVXTOD_O5_F2		\
+	.word	0x85b0230d;
+#define MOVXTOD_O5_F4		\
+	.word	0x89b0230d;
+#define MOVXTOD_O5_F6		\
+	.word	0x8db0230d;
+#define MOVXTOD_G3_F60		\
+	.word	0xbbb02303;
+#define MOVXTOD_G7_F62		\
+	.word	0xbfb02307;
+
+#endif /* _OPCODES_H */
--- a/arch/sparc/crypto/sha1_asm.S
+++ b/arch/sparc/crypto/sha1_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(sha1_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x0c], %f3
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x10], %f4
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	SHA1
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	SHA1
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha1_sparc64_transform)
--- a/arch/sparc/crypto/sha1_glue.c
+++ b/arch/sparc/crypto/sha1_glue.c
+/* Glue code for SHA1 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void sha1_sparc64_transform(u32 *digest, const char *data,
+				       unsigned int rounds);
+
+static int sha1_sparc64_init(struct shash_desc *desc)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha1_state){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
+
+	return 0;
+}
+
+static void __sha1_sparc64_update(struct sha1_state *sctx, const u8 *data,
+				  unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	sctx->count += len;
+	if (partial) {
+		done = SHA1_BLOCK_SIZE - partial;
+		memcpy(sctx->buffer + partial, data, done);
+		sha1_sparc64_transform(sctx->state, sctx->buffer, 1);
+	}
+	if (len - done >= SHA1_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+		sha1_sparc64_transform(sctx->state, data + done, rounds);
+		done += rounds * SHA1_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buffer, data + done, len - done);
+}
+
+static int sha1_sparc64_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA1_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buffer + partial, data, len);
+	} else
+		__sha1_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+/* Add padding and return the message digest. */
+static int sha1_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA1_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+
+	/* We need to fill a whole block for __sha1_sparc64_update() */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buffer + index, padding, padlen);
+	} else {
+		__sha1_sparc64_update(sctx, padding, padlen, index);
+	}
+	__sha1_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 5; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_sparc64_export(struct shash_desc *desc, void *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_sparc64_import(struct shash_desc *desc, const void *in)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA1_DIGEST_SIZE,
+	.init		=	sha1_sparc64_init,
+	.update		=	sha1_sparc64_update,
+	.final		=	sha1_sparc64_final,
+	.export		=	sha1_sparc64_export,
+	.import		=	sha1_sparc64_import,
+	.descsize	=	sizeof(struct sha1_state),
+	.statesize	=	sizeof(struct sha1_state),
+	.base		=	{
+		.cra_name	=	"sha1",
+		.cra_driver_name=	"sha1-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA1_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_sha1_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA1))
+		return false;
+
+	return true;
+}
+
+static int __init sha1_sparc64_mod_init(void)
+{
+	if (sparc64_has_sha1_opcode()) {
+		pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("sparc64 sha1 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit sha1_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_sparc64_mod_init);
+module_exit(sha1_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, sparc64 sha1 opcode accelerated");
+
+MODULE_ALIAS("sha1");
--- a/arch/sparc/crypto/sha256_asm.S
+++ b/arch/sparc/crypto/sha256_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(sha256_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	ld	[%o0 + 0x0c], %f3
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x18], %f6
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x1c], %f7
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	SHA256
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	st	%f5, [%o0 + 0x14]
+	st	%f6, [%o0 + 0x18]
+	st	%f7, [%o0 + 0x1c]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	SHA256
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha256_sparc64_transform)
--- a/arch/sparc/crypto/sha256_glue.c
+++ b/arch/sparc/crypto/sha256_glue.c
+/* Glue code for SHA256 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon crypto/sha256_generic.c
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void sha256_sparc64_transform(u32 *digest, const char *data,
+					 unsigned int rounds);
+
+static int sha224_sparc64_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int sha256_sparc64_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static void __sha256_sparc64_update(struct sha256_state *sctx, const u8 *data,
+				    unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	sctx->count += len;
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_sparc64_transform(sctx->state, sctx->buf, 1);
+	}
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_sparc64_transform(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+}
+
+static int sha256_sparc64_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+	} else
+		__sha256_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+static int sha256_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56) - index);
+
+	/* We need to fill a whole block for __sha256_sparc64_update() */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha256_sparc64_update(sctx, padding, padlen, index);
+	}
+	__sha256_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_sparc64_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_sparc64_final(desc, D);
+
+	memcpy(hash, D, SHA224_DIGEST_SIZE);
+	memset(D, 0, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+static int sha256_sparc64_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+	return 0;
+}
+
+static int sha256_sparc64_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+	return 0;
+}
+
+static struct shash_alg sha256 = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_sparc64_init,
+	.update		=	sha256_sparc64_update,
+	.final		=	sha256_sparc64_final,
+	.export		=	sha256_sparc64_export,
+	.import		=	sha256_sparc64_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name=	"sha256-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static struct shash_alg sha224 = {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_sparc64_init,
+	.update		=	sha256_sparc64_update,
+	.final		=	sha224_sparc64_final,
+	.descsize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name=	"sha224-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_sha256_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA256))
+		return false;
+
+	return true;
+}
+
+static int __init sha256_sparc64_mod_init(void)
+{
+	if (sparc64_has_sha256_opcode()) {
+		int ret = crypto_register_shash(&sha224);
+		if (ret < 0)
+			return ret;
+
+		ret = crypto_register_shash(&sha256);
+		if (ret < 0) {
+			crypto_unregister_shash(&sha224);
+			return ret;
+		}
+
+		pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+		return 0;
+	}
+	pr_info("sparc64 sha256 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit sha256_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&sha224);
+	crypto_unregister_shash(&sha256);
+}
+
+module_init(sha256_sparc64_mod_init);
+module_exit(sha256_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, sparc64 sha256 opcode accelerated");
+
+MODULE_ALIAS("sha224");
+MODULE_ALIAS("sha256");
--- a/arch/sparc/crypto/sha512_asm.S
+++ b/arch/sparc/crypto/sha512_asm.S
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(sha512_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	andcc	%o1, 0x7, %g0
+	ldd	[%o0 + 0x30], %f12
+	bne,pn	%xcc, 10f
+	 ldd	[%o0 + 0x38], %f14
+
+1:
+	ldd	[%o1 + 0x00], %f16
+	ldd	[%o1 + 0x08], %f18
+	ldd	[%o1 + 0x10], %f20
+	ldd	[%o1 + 0x18], %f22
+	ldd	[%o1 + 0x20], %f24
+	ldd	[%o1 + 0x28], %f26
+	ldd	[%o1 + 0x30], %f28
+	ldd	[%o1 + 0x38], %f30
+	ldd	[%o1 + 0x40], %f32
+	ldd	[%o1 + 0x48], %f34
+	ldd	[%o1 + 0x50], %f36
+	ldd	[%o1 + 0x58], %f38
+	ldd	[%o1 + 0x60], %f40
+	ldd	[%o1 + 0x68], %f42
+	ldd	[%o1 + 0x70], %f44
+	ldd	[%o1 + 0x78], %f46
+
+	SHA512
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x80, %o1
+
+5:
+	std	%f0, [%o0 + 0x00]
+	std	%f2, [%o0 + 0x08]
+	std	%f4, [%o0 + 0x10]
+	std	%f6, [%o0 + 0x18]
+	std	%f8, [%o0 + 0x20]
+	std	%f10, [%o0 + 0x28]
+	std	%f12, [%o0 + 0x30]
+	std	%f14, [%o0 + 0x38]
+	retl
+	 VISExit
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f18
+1:
+	ldd	[%o1 + 0x08], %f20
+	ldd	[%o1 + 0x10], %f22
+	ldd	[%o1 + 0x18], %f24
+	ldd	[%o1 + 0x20], %f26
+	ldd	[%o1 + 0x28], %f28
+	ldd	[%o1 + 0x30], %f30
+	ldd	[%o1 + 0x38], %f32
+	ldd	[%o1 + 0x40], %f34
+	ldd	[%o1 + 0x48], %f36
+	ldd	[%o1 + 0x50], %f38
+	ldd	[%o1 + 0x58], %f40
+	ldd	[%o1 + 0x60], %f42
+	ldd	[%o1 + 0x68], %f44
+	ldd	[%o1 + 0x70], %f46
+	ldd	[%o1 + 0x78], %f48
+	ldd	[%o1 + 0x80], %f50
+
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+	faligndata %f26, %f28, %f24
+	faligndata %f28, %f30, %f26
+	faligndata %f30, %f32, %f28
+	faligndata %f32, %f34, %f30
+	faligndata %f34, %f36, %f32
+	faligndata %f36, %f38, %f34
+	faligndata %f38, %f40, %f36
+	faligndata %f40, %f42, %f38
+	faligndata %f42, %f44, %f40
+	faligndata %f44, %f46, %f42
+	faligndata %f46, %f48, %f44
+	faligndata %f48, %f50, %f46
+
+	SHA512
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f50, %f18
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x80, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha512_sparc64_transform)
--- a/arch/sparc/crypto/sha512_glue.c
+++ b/arch/sparc/crypto/sha512_glue.c
+/* Glue code for SHA512 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon crypto/sha512_generic.c
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void sha512_sparc64_transform(u64 *digest, const char *data,
+					 unsigned int rounds);
+
+static int sha512_sparc64_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA512_H0;
+	sctx->state[1] = SHA512_H1;
+	sctx->state[2] = SHA512_H2;
+	sctx->state[3] = SHA512_H3;
+	sctx->state[4] = SHA512_H4;
+	sctx->state[5] = SHA512_H5;
+	sctx->state[6] = SHA512_H6;
+	sctx->state[7] = SHA512_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int sha384_sparc64_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA384_H0;
+	sctx->state[1] = SHA384_H1;
+	sctx->state[2] = SHA384_H2;
+	sctx->state[3] = SHA384_H3;
+	sctx->state[4] = SHA384_H4;
+	sctx->state[5] = SHA384_H5;
+	sctx->state[6] = SHA384_H6;
+	sctx->state[7] = SHA384_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static void __sha512_sparc64_update(struct sha512_state *sctx, const u8 *data,
+				    unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	if ((sctx->count[0] += len) < len)
+		sctx->count[1]++;
+	if (partial) {
+		done = SHA512_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha512_sparc64_transform(sctx->state, sctx->buf, 1);
+	}
+	if (len - done >= SHA512_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+		sha512_sparc64_transform(sctx->state, data + done, rounds);
+		done += rounds * SHA512_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+}
+
+static int sha512_sparc64_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA512_BLOCK_SIZE) {
+		if ((sctx->count[0] += len) < len)
+			sctx->count[1]++;
+		memcpy(sctx->buf + partial, data, len);
+	} else
+		__sha512_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+static int sha512_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be64 *dst = (__be64 *)out;
+	__be64 bits[2];
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	/* Save number of bits */
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
+
+	/* Pad out to 112 mod 128 and append length */
+	index = sctx->count[0] % SHA512_BLOCK_SIZE;
+	padlen = (index < 112) ? (112 - index) : ((SHA512_BLOCK_SIZE+112) - index);
+
+	/* We need to fill a whole block for __sha512_sparc64_update() */
+	if (padlen <= 112) {
+		if ((sctx->count[0] += padlen) < padlen)
+			sctx->count[1]++;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha512_sparc64_update(sctx, padding, padlen, index);
+	}
+	__sha512_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 112);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha384_sparc64_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[64];
+
+	sha512_sparc64_final(desc, D);
+
+	memcpy(hash, D, 48);
+	memset(D, 0, 64);
+
+	return 0;
+}
+
+static struct shash_alg sha512 = {
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.init		=	sha512_sparc64_init,
+	.update		=	sha512_sparc64_update,
+	.final		=	sha512_sparc64_final,
+	.descsize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha512",
+		.cra_driver_name=	"sha512-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA512_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static struct shash_alg sha384 = {
+	.digestsize	=	SHA384_DIGEST_SIZE,
+	.init		=	sha384_sparc64_init,
+	.update		=	sha512_sparc64_update,
+	.final		=	sha384_sparc64_final,
+	.descsize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha384",
+		.cra_driver_name=	"sha384-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA384_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_sha512_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA512))
+		return false;
+
+	return true;
+}
+
+static int __init sha512_sparc64_mod_init(void)
+{
+	if (sparc64_has_sha512_opcode()) {
+		int ret = crypto_register_shash(&sha384);
+		if (ret < 0)
+			return ret;
+
+		ret = crypto_register_shash(&sha512);
+		if (ret < 0) {
+			crypto_unregister_shash(&sha384);
+			return ret;
+		}
+
+		pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n");
+		return 0;
+	}
+	pr_info("sparc64 sha512 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit sha512_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&sha384);
+	crypto_unregister_shash(&sha512);
+}
+
+module_init(sha512_sparc64_mod_init);
+module_exit(sha512_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-384 and SHA-512 Secure Hash Algorithm, sparc64 sha512 opcode accelerated");
+
+MODULE_ALIAS("sha384");
+MODULE_ALIAS("sha512");
--- a/arch/sparc/include/asm/asi.h
+++ b/arch/sparc/include/asm/asi.h
@@ -141,7 +141,8 @@
 /* SpitFire and later extended ASIs.  The "(III)" marker designates
 * UltraSparc-III and later specific ASIs.  The "(CMT)" marker designates
 * Chip Multi Threading specific ASIs.  "(NG)" designates Niagara specific
- * ASIs, "(4V)" designates SUN4V specific ASIs.
+ * ASIs, "(4V)" designates SUN4V specific ASIs.  "(NG4)" designates SPARC-T4
+ * and later ASIs.
 */
 #define ASI_PHYS_USE_EC		0x14 /* PADDR, E-cachable		*/
 #define ASI_PHYS_BYPASS_EC_E	0x15 /* PADDR, E-bit			*/
@@ -243,6 +244,7 @@
 #define ASI_UDBL_CONTROL_R	0x7f /* External UDB control regs rd low*/
 #define ASI_INTR_R		0x7f /* IRQ vector dispatch read	*/
 #define ASI_INTR_DATAN_R	0x7f /* (III) In irq vector data reg N	*/
+#define ASI_PIC			0xb0 /* (NG4) PIC registers		*/
 #define ASI_PST8_P		0xc0 /* Primary, 8 8-bit, partial	*/
 #define ASI_PST8_S		0xc1 /* Secondary, 8 8-bit, partial	*/
 #define ASI_PST16_P		0xc2 /* Primary, 4 16-bit, partial	*/

--- a/arch/sparc/include/asm/elf_64.h
+++ b/arch/sparc/include/asm/elf_64.h
@@ -86,6 +86,15 @@
 #define AV_SPARC_IMA		0x00400000 /* integer multiply-add */
 #define AV_SPARC_ASI_CACHE_SPARING \
 				0x00800000 /* cache sparing ASIs available */
+#define AV_SPARC_PAUSE		0x01000000 /* PAUSE available */
+#define AV_SPARC_CBCOND		0x02000000 /* CBCOND insns available */
+
+/* Solaris decided to enumerate every single crypto instruction type
+ * in the AT_HWCAP bits.  This is wasteful, since if crypto is present,
+ * you still need to look in the CFR register to see if the opcode is
+ * really available.  So we simply advertise only "crypto" support.
+ */
+#define HWCAP_SPARC_CRYPTO	0x04000000 /* CRYPTO insns available */

 #define CORE_DUMP_USE_REGSET


--- a/arch/sparc/include/asm/hypervisor.h
+++ b/arch/sparc/include/asm/hypervisor.h
@@ -2934,6 +2934,16 @@ extern unsigned long sun4v_reboot_data_set(unsigned long ra,
 					   unsigned long len);
 #endif

+#define HV_FAST_VT_GET_PERFREG		0x184
+#define HV_FAST_VT_SET_PERFREG		0x185
+
+#ifndef __ASSEMBLY__
+extern unsigned long sun4v_vt_get_perfreg(unsigned long reg_num,
+					  unsigned long *reg_val);
+extern unsigned long sun4v_vt_set_perfreg(unsigned long reg_num,
+					  unsigned long reg_val);
+#endif
+
 /* Function numbers for HV_CORE_TRAP.  */
 #define HV_CORE_SET_VER			0x00
 #define HV_CORE_PUTCHAR			0x01
@@ -2964,6 +2974,7 @@ extern unsigned long sun4v_reboot_data_set(unsigned long ra,
 #define HV_GRP_NIU			0x0204
 #define HV_GRP_VF_CPU			0x0205
 #define HV_GRP_KT_CPU			0x0209
+#define HV_GRP_VT_CPU			0x020c
 #define HV_GRP_DIAG			0x0300

 #ifndef __ASSEMBLY__

--- a/arch/sparc/include/asm/mdesc.h
+++ b/arch/sparc/include/asm/mdesc.h
@@ -73,6 +73,7 @@ extern void mdesc_register_notifier(struct mdesc_notifier_client *client);

 extern void mdesc_fill_in_cpu_data(cpumask_t *mask);
 extern void mdesc_populate_present_mask(cpumask_t *mask);
+extern void mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask);

 extern void sun4v_mdesc_init(void);


--- a/arch/sparc/include/asm/pcr.h
+++ b/arch/sparc/include/asm/pcr.h
@@ -2,8 +2,13 @@
 #define __PCR_H

 struct pcr_ops {
-	u64 (*read)(void);
-	void (*write)(u64);
+	u64 (*read_pcr)(unsigned long);
+	void (*write_pcr)(unsigned long, u64);
+	u64 (*read_pic)(unsigned long);
+	void (*write_pic)(unsigned long, u64);
+	u64 (*nmi_picl_value)(unsigned int nmi_hz);
+	u64 pcr_nmi_enable;
+	u64 pcr_nmi_disable;
 };
 extern const struct pcr_ops *pcr_ops;

@@ -27,21 +32,18 @@ extern void schedule_deferred_pcr_work(void);
 #define PCR_N2_SL1_SHIFT	27
 #define PCR_N2_OV1		0x80000000

-extern unsigned int picl_shift;
-
-/* In order to commonize as much of the implementation as
- * possible, we use PICH as our counter.  Mostly this is
- * to accommodate Niagara-1 which can only count insn cycles
- * in PICH.
- */
-static inline u64 picl_value(unsigned int nmi_hz)
-{
-	u32 delta = local_cpu_data().clock_tick / (nmi_hz << picl_shift);
-
-	return ((u64)((0 - delta) & 0xffffffff)) << 32;
-}
-
-extern u64 pcr_enable;
+#define PCR_N4_OV		0x00000001 /* PIC overflow             */
+#define PCR_N4_TOE		0x00000002 /* Trap On Event            */
+#define PCR_N4_UTRACE		0x00000004 /* Trace user events        */
+#define PCR_N4_STRACE		0x00000008 /* Trace supervisor events  */
+#define PCR_N4_HTRACE		0x00000010 /* Trace hypervisor events  */
+#define PCR_N4_MASK		0x000007e0 /* Event mask               */
+#define PCR_N4_MASK_SHIFT	5
+#define PCR_N4_SL		0x0000f800 /* Event Select             */
+#define PCR_N4_SL_SHIFT		11
+#define PCR_N4_PICNPT		0x00010000 /* PIC non-privileged trap  */
+#define PCR_N4_PICNHT		0x00020000 /* PIC non-hypervisor trap  */
+#define PCR_N4_NTC		0x00040000 /* Next-To-Commit wrap      */

 extern int pcr_arch_init(void);


--- a/arch/sparc/include/asm/perfctr.h
+++ b/arch/sparc/include/asm/perfctr.h
@@ -54,11 +54,6 @@ enum perfctr_opcode {
 	PERFCTR_GETPCR
 };

-/* I don't want the kernel's namespace to be polluted with this
- * stuff when this file is included.  --DaveM
- */
-#ifndef __KERNEL__
-
 #define  PRIV 0x00000001
 #define  SYS  0x00000002
 #define  USR  0x00000004
@@ -168,29 +163,4 @@ struct vcounter_struct {
  unsigned long long vcnt1;
 };

-#else /* !(__KERNEL__) */
-
-#ifndef CONFIG_SPARC32
-
-/* Performance counter register access. */
-#define read_pcr(__p)  __asm__ __volatile__("rd	%%pcr, %0" : "=r" (__p))
-#define write_pcr(__p) __asm__ __volatile__("wr	%0, 0x0, %%pcr" : : "r" (__p))
-#define read_pic(__p)  __asm__ __volatile__("rd %%pic, %0" : "=r" (__p))
-
-/* Blackbird errata workaround.  See commentary in
- * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt()
- * for more information.
- */
-#define write_pic(__p)  					\
-	__asm__ __volatile__("ba,pt	%%xcc, 99f\n\t"		\
-			     " nop\n\t"				\
-			     ".align	64\n"			\
-			  "99:wr	%0, 0x0, %%pic\n\t"	\
-			     "rd	%%pic, %%g0" : : "r" (__p))
-#define reset_pic()	write_pic(0)
-
-#endif /* !CONFIG_SPARC32 */
-
-#endif /* !(__KERNEL__) */
-
 #endif /* !(PERF_COUNTER_API) */
--- a/arch/sparc/include/asm/pstate.h
+++ b/arch/sparc/include/asm/pstate.h
@@ -88,4 +88,18 @@
 #define VERS_MAXTL	_AC(0x000000000000ff00,UL) /* Max Trap Level.	*/
 #define VERS_MAXWIN	_AC(0x000000000000001f,UL) /* Max RegWindow Idx.*/

+/* Compatability Feature Register (%asr26), SPARC-T4 and later  */
+#define CFR_AES		_AC(0x0000000000000001,UL) /* Supports AES opcodes     */
+#define CFR_DES		_AC(0x0000000000000002,UL) /* Supports DES opcodes     */
+#define CFR_KASUMI	_AC(0x0000000000000004,UL) /* Supports KASUMI opcodes  */
+#define CFR_CAMELLIA	_AC(0x0000000000000008,UL) /* Supports CAMELLIA opcodes*/
+#define CFR_MD5		_AC(0x0000000000000010,UL) /* Supports MD5 opcodes     */
+#define CFR_SHA1	_AC(0x0000000000000020,UL) /* Supports SHA1 opcodes    */
+#define CFR_SHA256	_AC(0x0000000000000040,UL) /* Supports SHA256 opcodes  */
+#define CFR_SHA512	_AC(0x0000000000000080,UL) /* Supports SHA512 opcodes  */
+#define CFR_MPMUL	_AC(0x0000000000000100,UL) /* Supports MPMUL opcodes   */
+#define CFR_MONTMUL	_AC(0x0000000000000200,UL) /* Supports MONTMUL opcodes */
+#define CFR_MONTSQR	_AC(0x0000000000000400,UL) /* Supports MONTSQR opcodes */
+#define CFR_CRC32C	_AC(0x0000000000000800,UL) /* Supports CRC32C opcodes  */
+
 #endif /* !(_SPARC64_PSTATE_H) */
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -559,10 +559,10 @@ niagara_tlb_fixup:
 	be,pt	%xcc, niagara2_patch
 	 nop
 	cmp	%g1, SUN4V_CHIP_NIAGARA4
-	be,pt	%xcc, niagara2_patch
+	be,pt	%xcc, niagara4_patch
 	 nop
 	cmp	%g1, SUN4V_CHIP_NIAGARA5
-	be,pt	%xcc, niagara2_patch
+	be,pt	%xcc, niagara4_patch
 	 nop

 	call	generic_patch_copyops
@@ -573,6 +573,16 @@ niagara_tlb_fixup:
 	 nop

 	ba,a,pt	%xcc, 80f
+niagara4_patch:
+	call	niagara4_patch_copyops
+	 nop
+	call	niagara_patch_bzero
+	 nop
+	call	niagara4_patch_pageops
+	 nop
+
+	ba,a,pt	%xcc, 80f
+
 niagara2_patch:
 	call	niagara2_patch_copyops
 	 nop

--- a/arch/sparc/kernel/hvapi.c
+++ b/arch/sparc/kernel/hvapi.c
@@ -45,6 +45,7 @@ static struct api_info api_table[] = {
 	{ .group = HV_GRP_NIU,					},
 	{ .group = HV_GRP_VF_CPU,				},
 	{ .group = HV_GRP_KT_CPU,				},
+	{ .group = HV_GRP_VT_CPU,				},
 	{ .group = HV_GRP_DIAG,		.flags = FLAG_PRE_API	},
 };


--- a/arch/sparc/kernel/hvcalls.S
+++ b/arch/sparc/kernel/hvcalls.S
@@ -805,3 +805,19 @@ ENTRY(sun4v_reboot_data_set)
 	retl
 	 nop
 ENDPROC(sun4v_reboot_data_set)
+
+ENTRY(sun4v_vt_get_perfreg)
+	mov	%o1, %o4
+	mov	HV_FAST_VT_GET_PERFREG, %o5
+	ta	HV_FAST_TRAP
+	stx	%o1, [%o4]
+	retl
+	 nop
+ENDPROC(sun4v_vt_get_perfreg)
+
+ENTRY(sun4v_vt_set_perfreg)
+	mov	HV_FAST_VT_SET_PERFREG, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+ENDPROC(sun4v_vt_set_perfreg)
--- a/arch/sparc/kernel/ktlb.S
+++ b/arch/sparc/kernel/ktlb.S
@@ -188,31 +188,26 @@ valid_addr_bitmap_patch:
 	be,pn		%xcc, kvmap_dtlb_longpath

 2:	 sethi		%hi(kpte_linear_bitmap), %g2
-	or		%g2, %lo(kpte_linear_bitmap), %g2

 	/* Get the 256MB physical address index. */
 	sllx		%g4, 21, %g5
-	mov		1, %g7
+	or		%g2, %lo(kpte_linear_bitmap), %g2
 	srlx		%g5, 21 + 28, %g5
+	and		%g5, (32 - 1), %g7

-	/* Don't try this at home kids... this depends upon srlx
-	 * only taking the low 6 bits of the shift count in %g5.
-	 */
-	sllx		%g7, %g5, %g7
-
-	/* Divide by 64 to get the offset into the bitmask.  */
-	srlx		%g5, 6, %g5
+	/* Divide by 32 to get the offset into the bitmask.  */
+	srlx		%g5, 5, %g5
+	add		%g7, %g7, %g7
 	sllx		%g5, 3, %g5

-	/* kern_linear_pte_xor[((mask & bit) ? 1 : 0)] */
+	/* kern_linear_pte_xor[(mask >> shift) & 3)] */
 	ldx		[%g2 + %g5], %g2
-	andcc		%g2, %g7, %g0
+	srlx		%g2, %g7, %g7
 	sethi		%hi(kern_linear_pte_xor), %g5
+	and		%g7, 3, %g7
 	or		%g5, %lo(kern_linear_pte_xor), %g5
-	bne,a,pt	%xcc, 1f
-	 add		%g5, 8, %g5
-
-1:	ldx		[%g5], %g2
+	sllx		%g7, 3, %g7
+	ldx		[%g5 + %g7], %g2

 	.globl		kvmap_linear_patch
 kvmap_linear_patch:

--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -817,6 +817,30 @@ void __cpuinit mdesc_populate_present_mask(cpumask_t *mask)
 	mdesc_iterate_over_cpus(record_one_cpu, NULL, mask);
 }

+static void * __init check_one_pgsz(struct mdesc_handle *hp, u64 mp, int cpuid, void *arg)
+{
+	const u64 *pgsz_prop = mdesc_get_property(hp, mp, "mmu-page-size-list", NULL);
+	unsigned long *pgsz_mask = arg;
+	u64 val;
+
+	val = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
+	       HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);
+	if (pgsz_prop)
+		val = *pgsz_prop;
+
+	if (!*pgsz_mask)
+		*pgsz_mask = val;
+	else
+		*pgsz_mask &= val;
+	return NULL;
+}
+
+void __init mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask)
+{
+	*pgsz_mask = 0;
+	mdesc_iterate_over_cpus(check_one_pgsz, pgsz_mask, mask);
+}
+
 static void * __cpuinit fill_in_one_cpu(struct mdesc_handle *hp, u64 mp, int cpuid, void *arg)
 {
 	const u64 *cfreq = mdesc_get_property(hp, mp, "clock-frequency", NULL);

--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -22,7 +22,6 @@
 #include <asm/perf_event.h>
 #include <asm/ptrace.h>
 #include <asm/pcr.h>
-#include <asm/perfctr.h>

 #include "kstack.h"

@@ -109,7 +108,7 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
 		touched = 1;
 	else
-		pcr_ops->write(PCR_PIC_PRIV);
+		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);

 	sum = local_cpu_data().irq0_irqs;
 	if (__get_cpu_var(nmi_touch)) {
@@ -126,8 +125,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		__this_cpu_write(alert_counter, 0);
 	}
 	if (__get_cpu_var(wd_enabled)) {
-		write_pic(picl_value(nmi_hz));
-		pcr_ops->write(pcr_enable);
+		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
+		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
 	}

 	restore_hardirq_stack(orig_sp);
@@ -166,7 +165,7 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)

 void stop_nmi_watchdog(void *unused)
 {
-	pcr_ops->write(PCR_PIC_PRIV);
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
 	__get_cpu_var(wd_enabled) = 0;
 	atomic_dec(&nmi_active);
 }
@@ -223,10 +222,10 @@ void start_nmi_watchdog(void *unused)
 	__get_cpu_var(wd_enabled) = 1;
 	atomic_inc(&nmi_active);

-	pcr_ops->write(PCR_PIC_PRIV);
-	write_pic(picl_value(nmi_hz));
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
+	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

-	pcr_ops->write(pcr_enable);
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
 }

 static void nmi_adjust_hz_one(void *unused)
@@ -234,10 +233,10 @@ static void nmi_adjust_hz_one(void *unused)
 	if (!__get_cpu_var(wd_enabled))
 		return;

-	pcr_ops->write(PCR_PIC_PRIV);
-	write_pic(picl_value(nmi_hz));
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
+	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

-	pcr_ops->write(pcr_enable);
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
 }

 void nmi_adjust_hz(unsigned int new_hz)

--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -594,7 +594,7 @@ static int __devinit pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
 		printk(KERN_ERR PFX "Strange virtual-dma[%08x:%08x].\n",
 		       vdma[0], vdma[1]);
 		return -EINVAL;
-	};
+	}

 	dma_mask = (roundup_pow_of_two(vdma[1]) - 1UL);
 	num_tsb_entries = vdma[1] / IO_PAGE_SIZE;

--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -13,23 +13,14 @@
 #include <asm/pil.h>
 #include <asm/pcr.h>
 #include <asm/nmi.h>
+#include <asm/asi.h>
 #include <asm/spitfire.h>
-#include <asm/perfctr.h>

 /* This code is shared between various users of the performance
 * counters.  Users will be oprofile, pseudo-NMI watchdog, and the
 * perf_event support layer.
 */

-#define PCR_SUN4U_ENABLE	(PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE)
-#define PCR_N2_ENABLE		(PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE | \
-				 PCR_N2_TOE_OV1 | \
-				 (2 << PCR_N2_SL1_SHIFT) | \
-				 (0xff << PCR_N2_MASK1_SHIFT))
-
-u64 pcr_enable;
-unsigned int picl_shift;
-
 /* Performance counter interrupts run unmasked at PIL level 15.
 * Therefore we can't do things like wakeups and other work
 * that expects IRQ disabling to be adhered to in locking etc.
@@ -60,39 +51,144 @@ void arch_irq_work_raise(void)
 const struct pcr_ops *pcr_ops;
 EXPORT_SYMBOL_GPL(pcr_ops);

-static u64 direct_pcr_read(void)
+static u64 direct_pcr_read(unsigned long reg_num)
 {
 	u64 val;

-	read_pcr(val);
+	WARN_ON_ONCE(reg_num != 0);
+	__asm__ __volatile__("rd %%pcr, %0" : "=r" (val));
 	return val;
 }

-static void direct_pcr_write(u64 val)
+static void direct_pcr_write(unsigned long reg_num, u64 val)
+{
+	WARN_ON_ONCE(reg_num != 0);
+	__asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (val));
+}
+
+static u64 direct_pic_read(unsigned long reg_num)
 {
-	write_pcr(val);
+	u64 val;
+
+	WARN_ON_ONCE(reg_num != 0);
+	__asm__ __volatile__("rd %%pic, %0" : "=r" (val));
+	return val;
+}
+
+static void direct_pic_write(unsigned long reg_num, u64 val)
+{
+	WARN_ON_ONCE(reg_num != 0);
+
+	/* Blackbird errata workaround.  See commentary in
+	 * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt()
+	 * for more information.
+	 */
+	__asm__ __volatile__("ba,pt	%%xcc, 99f\n\t"
+			     " nop\n\t"
+			     ".align	64\n"
+			  "99:wr	%0, 0x0, %%pic\n\t"
+			     "rd	%%pic, %%g0" : : "r" (val));
+}
+
+static u64 direct_picl_value(unsigned int nmi_hz)
+{
+	u32 delta = local_cpu_data().clock_tick / nmi_hz;
+
+	return ((u64)((0 - delta) & 0xffffffff)) << 32;
 }

 static const struct pcr_ops direct_pcr_ops = {
-	.read	= direct_pcr_read,
-	.write	= direct_pcr_write,
+	.read_pcr		= direct_pcr_read,
+	.write_pcr		= direct_pcr_write,
+	.read_pic		= direct_pic_read,
+	.write_pic		= direct_pic_write,
+	.nmi_picl_value		= direct_picl_value,
+	.pcr_nmi_enable		= (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE),
+	.pcr_nmi_disable	= PCR_PIC_PRIV,
 };

-static void n2_pcr_write(u64 val)
+static void n2_pcr_write(unsigned long reg_num, u64 val)
 {
 	unsigned long ret;

+	WARN_ON_ONCE(reg_num != 0);
 	if (val & PCR_N2_HTRACE) {
 		ret = sun4v_niagara2_setperf(HV_N2_PERF_SPARC_CTL, val);
 		if (ret != HV_EOK)
-			write_pcr(val);
+			direct_pcr_write(reg_num, val);
 	} else
-		write_pcr(val);
+		direct_pcr_write(reg_num, val);
+}
+
+static u64 n2_picl_value(unsigned int nmi_hz)
+{
+	u32 delta = local_cpu_data().clock_tick / (nmi_hz << 2);
+
+	return ((u64)((0 - delta) & 0xffffffff)) << 32;
 }

 static const struct pcr_ops n2_pcr_ops = {
-	.read	= direct_pcr_read,
-	.write	= n2_pcr_write,
+	.read_pcr		= direct_pcr_read,
+	.write_pcr		= n2_pcr_write,
+	.read_pic		= direct_pic_read,
+	.write_pic		= direct_pic_write,
+	.nmi_picl_value		= n2_picl_value,
+	.pcr_nmi_enable		= (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE |
+				   PCR_N2_TOE_OV1 |
+				   (2 << PCR_N2_SL1_SHIFT) |
+				   (0xff << PCR_N2_MASK1_SHIFT)),
+	.pcr_nmi_disable	= PCR_PIC_PRIV,
+};
+
+static u64 n4_pcr_read(unsigned long reg_num)
+{
+	unsigned long val;
+
+	(void) sun4v_vt_get_perfreg(reg_num, &val);
+
+	return val;
+}
+
+static void n4_pcr_write(unsigned long reg_num, u64 val)
+{
+	(void) sun4v_vt_set_perfreg(reg_num, val);
+}
+
+static u64 n4_pic_read(unsigned long reg_num)
+{
+	unsigned long val;
+
+	__asm__ __volatile__("ldxa [%1] %2, %0"
+			     : "=r" (val)
+			     : "r" (reg_num * 0x8UL), "i" (ASI_PIC));
+
+	return val;
+}
+
+static void n4_pic_write(unsigned long reg_num, u64 val)
+{
+	__asm__ __volatile__("stxa %0, [%1] %2"
+			     : /* no outputs */
+			     : "r" (val), "r" (reg_num * 0x8UL), "i" (ASI_PIC));
+}
+
+static u64 n4_picl_value(unsigned int nmi_hz)
+{
+	u32 delta = local_cpu_data().clock_tick / (nmi_hz << 2);
+
+	return ((u64)((0 - delta) & 0xffffffff));
+}
+
+static const struct pcr_ops n4_pcr_ops = {
+	.read_pcr		= n4_pcr_read,
+	.write_pcr		= n4_pcr_write,
+	.read_pic		= n4_pic_read,
+	.write_pic		= n4_pic_write,
+	.nmi_picl_value		= n4_picl_value,
+	.pcr_nmi_enable		= (PCR_N4_PICNPT | PCR_N4_STRACE |
+				   PCR_N4_UTRACE | PCR_N4_TOE |
+				   (26 << PCR_N4_SL_SHIFT)),
+	.pcr_nmi_disable	= PCR_N4_PICNPT,
 };

 static unsigned long perf_hsvc_group;
@@ -115,6 +211,10 @@ static int __init register_perf_hsvc(void)
 			perf_hsvc_group = HV_GRP_KT_CPU;
 			break;

+		case SUN4V_CHIP_NIAGARA4:
+			perf_hsvc_group = HV_GRP_VT_CPU;
+			break;
+
 		default:
 			return -ENODEV;
 		}
@@ -139,6 +239,29 @@ static void __init unregister_perf_hsvc(void)
 	sun4v_hvapi_unregister(perf_hsvc_group);
 }

+static int __init setup_sun4v_pcr_ops(void)
+{
+	int ret = 0;
+
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_NIAGARA1:
+	case SUN4V_CHIP_NIAGARA2:
+	case SUN4V_CHIP_NIAGARA3:
+		pcr_ops = &n2_pcr_ops;
+		break;
+
+	case SUN4V_CHIP_NIAGARA4:
+		pcr_ops = &n4_pcr_ops;
+		break;
+
+	default:
+		ret = -ENODEV;
+		break;
+	}
+
+	return ret;
+}
+
 int __init pcr_arch_init(void)
 {
 	int err = register_perf_hsvc();
@@ -148,15 +271,14 @@ int __init pcr_arch_init(void)

 	switch (tlb_type) {
 	case hypervisor:
-		pcr_ops = &n2_pcr_ops;
-		pcr_enable = PCR_N2_ENABLE;
-		picl_shift = 2;
+		err = setup_sun4v_pcr_ops();
+		if (err)
+			goto out_unregister;
 		break;

 	case cheetah:
 	case cheetah_plus:
 		pcr_ops = &direct_pcr_ops;
-		pcr_enable = PCR_SUN4U_ENABLE;
 		break;

 	case spitfire:

--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -25,36 +25,48 @@
 #include <linux/atomic.h>
 #include <asm/nmi.h>
 #include <asm/pcr.h>
-#include <asm/perfctr.h>
 #include <asm/cacheflush.h>

 #include "kernel.h"
 #include "kstack.h"

-/* Sparc64 chips have two performance counters, 32-bits each, with
- * overflow interrupts generated on transition from 0xffffffff to 0.
- * The counters are accessed in one go using a 64-bit register.
+/* Two classes of sparc64 chips currently exist.  All of which have
+ * 32-bit counters which can generate overflow interrupts on the
+ * transition from 0xffffffff to 0.
 *
- * Both counters are controlled using a single control register.  The
- * only way to stop all sampling is to clear all of the context (user,
- * supervisor, hypervisor) sampling enable bits.  But these bits apply
- * to both counters, thus the two counters can't be enabled/disabled
- * individually.
+ * All chips upto and including SPARC-T3 have two performance
+ * counters.  The two 32-bit counters are accessed in one go using a
+ * single 64-bit register.
 *
- * The control register has two event fields, one for each of the two
- * counters.  It's thus nearly impossible to have one counter going
- * while keeping the other one stopped.  Therefore it is possible to
- * get overflow interrupts for counters not currently "in use" and
- * that condition must be checked in the overflow interrupt handler.
+ * On these older chips both counters are controlled using a single
+ * control register.  The only way to stop all sampling is to clear
+ * all of the context (user, supervisor, hypervisor) sampling enable
+ * bits.  But these bits apply to both counters, thus the two counters
+ * can't be enabled/disabled individually.
+ *
+ * Furthermore, the control register on these older chips have two
+ * event fields, one for each of the two counters.  It's thus nearly
+ * impossible to have one counter going while keeping the other one
+ * stopped.  Therefore it is possible to get overflow interrupts for
+ * counters not currently "in use" and that condition must be checked
+ * in the overflow interrupt handler.
 *
 * So we use a hack, in that we program inactive counters with the
 * "sw_count0" and "sw_count1" events.  These count how many times
 * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
 * unusual way to encode a NOP and therefore will not trigger in
 * normal code.
+ *
+ * Starting with SPARC-T4 we have one control register per counter.
+ * And the counters are stored in individual registers.  The registers
+ * for the counters are 64-bit but only a 32-bit counter is
+ * implemented.  The event selections on SPARC-T4 lack any
+ * restrictions, therefore we can elide all of the complicated
+ * conflict resolution code we have for SPARC-T3 and earlier chips.
 */

-#define MAX_HWEVENTS			2
+#define MAX_HWEVENTS			4
+#define MAX_PCRS			4
 #define MAX_PERIOD			((1UL << 32) - 1)

 #define PIC_UPPER_INDEX			0
@@ -90,8 +102,8 @@ struct cpu_hw_events {
 	 */
 	int			current_idx[MAX_HWEVENTS];

-	/* Software copy of %pcr register on this cpu.  */
-	u64			pcr;
+	/* Software copy of %pcr register(s) on this cpu.  */
+	u64			pcr[MAX_HWEVENTS];

 	/* Enabled/disable state.  */
 	int			enabled;
@@ -103,6 +115,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, };
 /* An event map describes the characteristics of a performance
 * counter event.  In particular it gives the encoding as well as
 * a mask telling which counters the event can be measured on.
+ *
+ * The mask is unused on SPARC-T4 and later.
 */
 struct perf_event_map {
 	u16	encoding;
@@ -142,15 +156,53 @@ struct sparc_pmu {
 	const struct perf_event_map	*(*event_map)(int);
 	const cache_map_t		*cache_map;
 	int				max_events;
+	u32				(*read_pmc)(int);
+	void				(*write_pmc)(int, u64);
 	int				upper_shift;
 	int				lower_shift;
 	int				event_mask;
+	int				user_bit;
+	int				priv_bit;
 	int				hv_bit;
 	int				irq_bit;
 	int				upper_nop;
 	int				lower_nop;
+	unsigned int			flags;
+#define SPARC_PMU_ALL_EXCLUDES_SAME	0x00000001
+#define SPARC_PMU_HAS_CONFLICTS		0x00000002
+	int				max_hw_events;
+	int				num_pcrs;
+	int				num_pic_regs;
 };

+static u32 sparc_default_read_pmc(int idx)
+{
+	u64 val;
+
+	val = pcr_ops->read_pic(0);
+	if (idx == PIC_UPPER_INDEX)
+		val >>= 32;
+
+	return val & 0xffffffff;
+}
+
+static void sparc_default_write_pmc(int idx, u64 val)
+{
+	u64 shift, mask, pic;
+
+	shift = 0;
+	if (idx == PIC_UPPER_INDEX)
+		shift = 32;
+
+	mask = ((u64) 0xffffffff) << shift;
+	val <<= shift;
+
+	pic = pcr_ops->read_pic(0);
+	pic &= ~mask;
+	pic |= val;
+	pcr_ops->write_pic(0, pic);
+}
+
 static const struct perf_event_map ultra3_perfmon_event_map[] = {
 	[PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
 	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
@@ -268,11 +320,20 @@ static const struct sparc_pmu ultra3_pmu = {
 	.event_map	= ultra3_event_map,
 	.cache_map	= &ultra3_cache_map,
 	.max_events	= ARRAY_SIZE(ultra3_perfmon_event_map),
+	.read_pmc	= sparc_default_read_pmc,
+	.write_pmc	= sparc_default_write_pmc,
 	.upper_shift	= 11,
 	.lower_shift	= 4,
 	.event_mask	= 0x3f,
+	.user_bit	= PCR_UTRACE,
+	.priv_bit	= PCR_STRACE,
 	.upper_nop	= 0x1c,
 	.lower_nop	= 0x14,
+	.flags		= (SPARC_PMU_ALL_EXCLUDES_SAME |
+			   SPARC_PMU_HAS_CONFLICTS),
+	.max_hw_events	= 2,
+	.num_pcrs	= 1,
+	.num_pic_regs	= 1,
 };

 /* Niagara1 is very limited.  The upper PIC is hard-locked to count
@@ -397,11 +458,20 @@ static const struct sparc_pmu niagara1_pmu = {
 	.event_map	= niagara1_event_map,
 	.cache_map	= &niagara1_cache_map,
 	.max_events	= ARRAY_SIZE(niagara1_perfmon_event_map),
+	.read_pmc	= sparc_default_read_pmc,
+	.write_pmc	= sparc_default_write_pmc,
 	.upper_shift	= 0,
 	.lower_shift	= 4,
 	.event_mask	= 0x7,
+	.user_bit	= PCR_UTRACE,
+	.priv_bit	= PCR_STRACE,
 	.upper_nop	= 0x0,
 	.lower_nop	= 0x0,
+	.flags		= (SPARC_PMU_ALL_EXCLUDES_SAME |
+			   SPARC_PMU_HAS_CONFLICTS),
+	.max_hw_events	= 2,
+	.num_pcrs	= 1,
+	.num_pic_regs	= 1,
 };

 static const struct perf_event_map niagara2_perfmon_event_map[] = {
@@ -523,13 +593,203 @@ static const struct sparc_pmu niagara2_pmu = {
 	.event_map	= niagara2_event_map,
 	.cache_map	= &niagara2_cache_map,
 	.max_events	= ARRAY_SIZE(niagara2_perfmon_event_map),
+	.read_pmc	= sparc_default_read_pmc,
+	.write_pmc	= sparc_default_write_pmc,
 	.upper_shift	= 19,
 	.lower_shift	= 6,
 	.event_mask	= 0xfff,
-	.hv_bit		= 0x8,
+	.user_bit	= PCR_UTRACE,
+	.priv_bit	= PCR_STRACE,
+	.hv_bit		= PCR_N2_HTRACE,
 	.irq_bit	= 0x30,
 	.upper_nop	= 0x220,
 	.lower_nop	= 0x220,
+	.flags		= (SPARC_PMU_ALL_EXCLUDES_SAME |
+			   SPARC_PMU_HAS_CONFLICTS),
+	.max_hw_events	= 2,
+	.num_pcrs	= 1,
+	.num_pic_regs	= 1,
+};
+
+static const struct perf_event_map niagara4_perfmon_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = { (26 << 6) },
+	[PERF_COUNT_HW_INSTRUCTIONS] = { (3 << 6) | 0x3f },
+	[PERF_COUNT_HW_CACHE_REFERENCES] = { (3 << 6) | 0x04 },
+	[PERF_COUNT_HW_CACHE_MISSES] = { (16 << 6) | 0x07 },
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { (4 << 6) | 0x01 },
+	[PERF_COUNT_HW_BRANCH_MISSES] = { (25 << 6) | 0x0f },
+};
+
+static const struct perf_event_map *niagara4_event_map(int event_id)
+{
+	return &niagara4_perfmon_event_map[event_id];
+}
+
+static const cache_map_t niagara4_cache_map = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x04 },
+		[C(RESULT_MISS)] = { (16 << 6) | 0x07 },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x08 },
+		[C(RESULT_MISS)] = { (16 << 6) | 0x07 },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x3f },
+		[C(RESULT_MISS)] = { (11 << 6) | 0x03 },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_NONSENSE },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_NONSENSE },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x04 },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x08 },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { (17 << 6) | 0x3f },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { (6 << 6) | 0x3f },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)  ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED },
+		[ C(RESULT_MISS)   ] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+};
+
+static u32 sparc_vt_read_pmc(int idx)
+{
+	u64 val = pcr_ops->read_pic(idx);
+
+	return val & 0xffffffff;
+}
+
+static void sparc_vt_write_pmc(int idx, u64 val)
+{
+	u64 pcr;
+
+	/* There seems to be an internal latch on the overflow event
+	 * on SPARC-T4 that prevents it from triggering unless you
+	 * update the PIC exactly as we do here.  The requirement
+	 * seems to be that you have to turn off event counting in the
+	 * PCR around the PIC update.
+	 *
+	 * For example, after the following sequence:
+	 *
+	 * 1) set PIC to -1
+	 * 2) enable event counting and overflow reporting in PCR
+	 * 3) overflow triggers, softint 15 handler invoked
+	 * 4) clear OV bit in PCR
+	 * 5) write PIC to -1
+	 *
+	 * a subsequent overflow event will not trigger.  This
+	 * sequence works on SPARC-T3 and previous chips.
+	 */
+	pcr = pcr_ops->read_pcr(idx);
+	pcr_ops->write_pcr(idx, PCR_N4_PICNPT);
+
+	pcr_ops->write_pic(idx, val & 0xffffffff);
+
+	pcr_ops->write_pcr(idx, pcr);
+}
+
+static const struct sparc_pmu niagara4_pmu = {
+	.event_map	= niagara4_event_map,
+	.cache_map	= &niagara4_cache_map,
+	.max_events	= ARRAY_SIZE(niagara4_perfmon_event_map),
+	.read_pmc	= sparc_vt_read_pmc,
+	.write_pmc	= sparc_vt_write_pmc,
+	.upper_shift	= 5,
+	.lower_shift	= 5,
+	.event_mask	= 0x7ff,
+	.user_bit	= PCR_N4_UTRACE,
+	.priv_bit	= PCR_N4_STRACE,
+
+	/* We explicitly don't support hypervisor tracing.  The T4
+	 * generates the overflow event for precise events via a trap
+	 * which will not be generated (ie. it's completely lost) if
+	 * we happen to be in the hypervisor when the event triggers.
+	 * Essentially, the overflow event reporting is completely
+	 * unusable when you have hypervisor mode tracing enabled.
+	 */
+	.hv_bit		= 0,
+
+	.irq_bit	= PCR_N4_TOE,
+	.upper_nop	= 0,
+	.lower_nop	= 0,
+	.flags		= 0,
+	.max_hw_events	= 4,
+	.num_pcrs	= 4,
+	.num_pic_regs	= 4,
 };

 static const struct sparc_pmu *sparc_pmu __read_mostly;
@@ -558,55 +818,35 @@ static u64 nop_for_index(int idx)
 static inline void sparc_pmu_enable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx)
 {
 	u64 val, mask = mask_for_index(idx);
+	int pcr_index = 0;

-	val = cpuc->pcr;
+	if (sparc_pmu->num_pcrs > 1)
+		pcr_index = idx;
+
+	val = cpuc->pcr[pcr_index];
 	val &= ~mask;
 	val |= hwc->config;
-	cpuc->pcr = val;
+	cpuc->pcr[pcr_index] = val;

-	pcr_ops->write(cpuc->pcr);
+	pcr_ops->write_pcr(pcr_index, cpuc->pcr[pcr_index]);
 }

 static inline void sparc_pmu_disable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx)
 {
 	u64 mask = mask_for_index(idx);
 	u64 nop = nop_for_index(idx);
+	int pcr_index = 0;
 	u64 val;

-	val = cpuc->pcr;
+	if (sparc_pmu->num_pcrs > 1)
+		pcr_index = idx;
+
+	val = cpuc->pcr[pcr_index];
 	val &= ~mask;
 	val |= nop;
-	cpuc->pcr = val;
+	cpuc->pcr[pcr_index] = val;

-	pcr_ops->write(cpuc->pcr);
-}
-
-static u32 read_pmc(int idx)
-{
-	u64 val;
-
-	read_pic(val);
-	if (idx == PIC_UPPER_INDEX)
-		val >>= 32;
-
-	return val & 0xffffffff;
-}
-
-static void write_pmc(int idx, u64 val)
-{
-	u64 shift, mask, pic;
-
-	shift = 0;
-	if (idx == PIC_UPPER_INDEX)
-		shift = 32;
-
-	mask = ((u64) 0xffffffff) << shift;
-	val <<= shift;
-
-	read_pic(pic);
-	pic &= ~mask;
-	pic |= val;
-	write_pic(pic);
+	pcr_ops->write_pcr(pcr_index, cpuc->pcr[pcr_index]);
 }

 static u64 sparc_perf_event_update(struct perf_event *event,
@@ -618,7 +858,7 @@ static u64 sparc_perf_event_update(struct perf_event *event,

 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	new_raw_count = read_pmc(idx);
+	new_raw_count = sparc_pmu->read_pmc(idx);

 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
@@ -658,25 +898,17 @@ static int sparc_perf_event_set_period(struct perf_event *event,

 	local64_set(&hwc->prev_count, (u64)-left);

-	write_pmc(idx, (u64)(-left) & 0xffffffff);
+	sparc_pmu->write_pmc(idx, (u64)(-left) & 0xffffffff);

 	perf_event_update_userpage(event);

 	return ret;
 }

-/* If performance event entries have been added, move existing
- * events around (if necessary) and then assign new entries to
- * counters.
- */
-static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
+static void read_in_all_counters(struct cpu_hw_events *cpuc)
 {
 	int i;

-	if (!cpuc->n_added)
-		goto out;
-
-	/* Read in the counters which are moving.  */
 	for (i = 0; i < cpuc->n_events; i++) {
 		struct perf_event *cp = cpuc->event[i];

@@ -687,6 +919,20 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
 			cpuc->current_idx[i] = PIC_NO_INDEX;
 		}
 	}
+}
+
+/* On this PMU all PICs are programmed using a single PCR.  Calculate
+ * the combined control register value.
+ *
+ * For such chips we require that all of the events have the same
+ * configuration, so just fetch the settings from the first entry.
+ */
+static void calculate_single_pcr(struct cpu_hw_events *cpuc)
+{
+	int i;
+
+	if (!cpuc->n_added)
+		goto out;

 	/* Assign to counters all unassigned events.  */
 	for (i = 0; i < cpuc->n_events; i++) {
@@ -702,20 +948,71 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
 		cpuc->current_idx[i] = idx;

 		enc = perf_event_get_enc(cpuc->events[i]);
-		pcr &= ~mask_for_index(idx);
+		cpuc->pcr[0] &= ~mask_for_index(idx);
 		if (hwc->state & PERF_HES_STOPPED)
-			pcr |= nop_for_index(idx);
+			cpuc->pcr[0] |= nop_for_index(idx);
 		else
-			pcr |= event_encoding(enc, idx);
+			cpuc->pcr[0] |= event_encoding(enc, idx);
 	}
 out:
-	return pcr;
+	cpuc->pcr[0] |= cpuc->event[0]->hw.config_base;
+}
+
+/* On this PMU each PIC has it's own PCR control register.  */
+static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc)
+{
+	int i;
+
+	if (!cpuc->n_added)
+		goto out;
+
+	for (i = 0; i < cpuc->n_events; i++) {
+		struct perf_event *cp = cpuc->event[i];
+		struct hw_perf_event *hwc = &cp->hw;
+		int idx = hwc->idx;
+		u64 enc;
+
+		if (cpuc->current_idx[i] != PIC_NO_INDEX)
+			continue;
+
+		sparc_perf_event_set_period(cp, hwc, idx);
+		cpuc->current_idx[i] = idx;
+
+		enc = perf_event_get_enc(cpuc->events[i]);
+		cpuc->pcr[idx] &= ~mask_for_index(idx);
+		if (hwc->state & PERF_HES_STOPPED)
+			cpuc->pcr[idx] |= nop_for_index(idx);
+		else
+			cpuc->pcr[idx] |= event_encoding(enc, idx);
+	}
+out:
+	for (i = 0; i < cpuc->n_events; i++) {
+		struct perf_event *cp = cpuc->event[i];
+		int idx = cp->hw.idx;
+
+		cpuc->pcr[idx] |= cp->hw.config_base;
+	}
+}
+
+/* If performance event entries have been added, move existing events
+ * around (if necessary) and then assign new entries to counters.
+ */
+static void update_pcrs_for_enable(struct cpu_hw_events *cpuc)
+{
+	if (cpuc->n_added)
+		read_in_all_counters(cpuc);
+
+	if (sparc_pmu->num_pcrs == 1) {
+		calculate_single_pcr(cpuc);
+	} else {
+		calculate_multiple_pcrs(cpuc);
+	}
 }

 static void sparc_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	u64 pcr;
+	int i;

 	if (cpuc->enabled)
 		return;
@@ -723,26 +1020,17 @@ static void sparc_pmu_enable(struct pmu *pmu)
 	cpuc->enabled = 1;
 	barrier();

-	pcr = cpuc->pcr;
-	if (!cpuc->n_events) {
-		pcr = 0;
-	} else {
-		pcr = maybe_change_configuration(cpuc, pcr);
-
-		/* We require that all of the events have the same
-		 * configuration, so just fetch the settings from the
-		 * first entry.
-		 */
-		cpuc->pcr = pcr | cpuc->event[0]->hw.config_base;
-	}
+	if (cpuc->n_events)
+		update_pcrs_for_enable(cpuc);

-	pcr_ops->write(cpuc->pcr);
+	for (i = 0; i < sparc_pmu->num_pcrs; i++)
+		pcr_ops->write_pcr(i, cpuc->pcr[i]);
 }

 static void sparc_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	u64 val;
+	int i;

 	if (!cpuc->enabled)
 		return;
@@ -750,12 +1038,14 @@ static void sparc_pmu_disable(struct pmu *pmu)
 	cpuc->enabled = 0;
 	cpuc->n_added = 0;

-	val = cpuc->pcr;
-	val &= ~(PCR_UTRACE | PCR_STRACE |
-		 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
-	cpuc->pcr = val;
+	for (i = 0; i < sparc_pmu->num_pcrs; i++) {
+		u64 val = cpuc->pcr[i];

-	pcr_ops->write(cpuc->pcr);
+		val &= ~(sparc_pmu->user_bit | sparc_pmu->priv_bit |
+			 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
+		cpuc->pcr[i] = val;
+		pcr_ops->write_pcr(i, cpuc->pcr[i]);
+	}
 }

 static int active_event_index(struct cpu_hw_events *cpuc,
@@ -854,9 +1144,11 @@ static DEFINE_MUTEX(pmc_grab_mutex);
 static void perf_stop_nmi_watchdog(void *unused)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int i;

 	stop_nmi_watchdog(NULL);
-	cpuc->pcr = pcr_ops->read();
+	for (i = 0; i < sparc_pmu->num_pcrs; i++)
+		cpuc->pcr[i] = pcr_ops->read_pcr(i);
 }

 void perf_event_grab_pmc(void)
@@ -942,9 +1234,17 @@ static int sparc_check_constraints(struct perf_event **evts,
 	if (!n_ev)
 		return 0;

-	if (n_ev > MAX_HWEVENTS)
+	if (n_ev > sparc_pmu->max_hw_events)
 		return -1;

+	if (!(sparc_pmu->flags & SPARC_PMU_HAS_CONFLICTS)) {
+		int i;
+
+		for (i = 0; i < n_ev; i++)
+			evts[i]->hw.idx = i;
+		return 0;
+	}
+
 	msk0 = perf_event_get_msk(events[0]);
 	if (n_ev == 1) {
 		if (msk0 & PIC_LOWER)
@@ -1000,6 +1300,9 @@ static int check_excludes(struct perf_event **evts, int n_prev, int n_new)
 	struct perf_event *event;
 	int i, n, first;

+	if (!(sparc_pmu->flags & SPARC_PMU_ALL_EXCLUDES_SAME))
+		return 0;
+
 	n = n_prev + n_new;
 	if (n <= 1)
 		return 0;
@@ -1059,7 +1362,7 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags)
 	perf_pmu_disable(event->pmu);

 	n0 = cpuc->n_events;
-	if (n0 >= MAX_HWEVENTS)
+	if (n0 >= sparc_pmu->max_hw_events)
 		goto out;

 	cpuc->event[n0] = event;
@@ -1146,16 +1449,16 @@ static int sparc_pmu_event_init(struct perf_event *event)
 	/* We save the enable bits in the config_base.  */
 	hwc->config_base = sparc_pmu->irq_bit;
 	if (!attr->exclude_user)
-		hwc->config_base |= PCR_UTRACE;
+		hwc->config_base |= sparc_pmu->user_bit;
 	if (!attr->exclude_kernel)
-		hwc->config_base |= PCR_STRACE;
+		hwc->config_base |= sparc_pmu->priv_bit;
 	if (!attr->exclude_hv)
 		hwc->config_base |= sparc_pmu->hv_bit;

 	n = 0;
 	if (event->group_leader != event) {
 		n = collect_events(event->group_leader,
-				   MAX_HWEVENTS - 1,
+				   sparc_pmu->max_hw_events - 1,
 				   evts, events, current_idx_dmy);
 		if (n < 0)
 			return -EINVAL;
@@ -1254,8 +1557,7 @@ static struct pmu pmu = {
 void perf_event_print_debug(void)
 {
 	unsigned long flags;
-	u64 pcr, pic;
-	int cpu;
+	int cpu, i;

 	if (!sparc_pmu)
 		return;
@@ -1264,12 +1566,13 @@ void perf_event_print_debug(void)

 	cpu = smp_processor_id();

-	pcr = pcr_ops->read();
-	read_pic(pic);
-
 	pr_info("\n");
-	pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
-		cpu, pcr, pic);
+	for (i = 0; i < sparc_pmu->num_pcrs; i++)
+		pr_info("CPU#%d: PCR%d[%016llx]\n",
+			cpu, i, pcr_ops->read_pcr(i));
+	for (i = 0; i < sparc_pmu->num_pic_regs; i++)
+		pr_info("CPU#%d: PIC%d[%016llx]\n",
+			cpu, i, pcr_ops->read_pic(i));

 	local_irq_restore(flags);
 }
@@ -1305,8 +1608,9 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 	 * Do this before we peek at the counters to determine
 	 * overflow so we don't lose any events.
 	 */
-	if (sparc_pmu->irq_bit)
-		pcr_ops->write(cpuc->pcr);
+	if (sparc_pmu->irq_bit &&
+	    sparc_pmu->num_pcrs == 1)
+		pcr_ops->write_pcr(0, cpuc->pcr[0]);

 	for (i = 0; i < cpuc->n_events; i++) {
 		struct perf_event *event = cpuc->event[i];
@@ -1314,6 +1618,10 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 		struct hw_perf_event *hwc;
 		u64 val;

+		if (sparc_pmu->irq_bit &&
+		    sparc_pmu->num_pcrs > 1)
+			pcr_ops->write_pcr(idx, cpuc->pcr[idx]);
+
 		hwc = &event->hw;
 		val = sparc_perf_event_update(event, hwc, idx);
 		if (val & (1ULL << 31))
@@ -1352,6 +1660,10 @@ static bool __init supported_pmu(void)
 		sparc_pmu = &niagara2_pmu;
 		return true;
 	}
+	if (!strcmp(sparc_pmu_type, "niagara4")) {
+		sparc_pmu = &niagara4_pmu;
+		return true;
+	}
 	return false;
 }


--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -340,7 +340,12 @@ static const char *hwcaps[] = {
 	 */
 	"mul32", "div32", "fsmuld", "v8plus", "popc", "vis", "vis2",
 	"ASIBlkInit", "fmaf", "vis3", "hpc", "random", "trans", "fjfmau",
-	"ima", "cspare",
+	"ima", "cspare", "pause", "cbcond",
+};
+
+static const char *crypto_hwcaps[] = {
+	"aes", "des", "kasumi", "camellia", "md5", "sha1", "sha256",
+	"sha512", "mpmul", "montmul", "montsqr", "crc32c",
 };

 void cpucap_info(struct seq_file *m)
@@ -357,27 +362,61 @@ void cpucap_info(struct seq_file *m)
 			printed++;
 		}
 	}
+	if (caps & HWCAP_SPARC_CRYPTO) {
+		unsigned long cfr;
+
+		__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+		for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
+			unsigned long bit = 1UL << i;
+			if (cfr & bit) {
+				seq_printf(m, "%s%s",
+					   printed ? "," : "", crypto_hwcaps[i]);
+				printed++;
+			}
+		}
+	}
 	seq_putc(m, '\n');
 }

+static void __init report_one_hwcap(int *printed, const char *name)
+{
+	if ((*printed) == 0)
+		printk(KERN_INFO "CPU CAPS: [");
+	printk(KERN_CONT "%s%s",
+	       (*printed) ? "," : "", name);
+	if (++(*printed) == 8) {
+		printk(KERN_CONT "]\n");
+		*printed = 0;
+	}
+}
+
+static void __init report_crypto_hwcaps(int *printed)
+{
+	unsigned long cfr;
+	int i;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+
+	for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
+		unsigned long bit = 1UL << i;
+		if (cfr & bit)
+			report_one_hwcap(printed, crypto_hwcaps[i]);
+	}
+}
+
 static void __init report_hwcaps(unsigned long caps)
 {
 	int i, printed = 0;

-	printk(KERN_INFO "CPU CAPS: [");
 	for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
 		unsigned long bit = 1UL << i;
-		if (caps & bit) {
-			printk(KERN_CONT "%s%s",
-			       printed ? "," : "", hwcaps[i]);
-			if (++printed == 8) {
-				printk(KERN_CONT "]\n");
-				printk(KERN_INFO "CPU CAPS: [");
-				printed = 0;
-			}
-		}
+		if (caps & bit)
+			report_one_hwcap(&printed, hwcaps[i]);
 	}
-	printk(KERN_CONT "]\n");
+	if (caps & HWCAP_SPARC_CRYPTO)
+		report_crypto_hwcaps(&printed);
+	if (printed != 0)
+		printk(KERN_CONT "]\n");
 }

 static unsigned long __init mdesc_cpu_hwcap_list(void)
@@ -411,6 +450,10 @@ static unsigned long __init mdesc_cpu_hwcap_list(void)
 				break;
 			}
 		}
+		for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
+			if (!strcmp(prop, crypto_hwcaps[i]))
+				caps |= HWCAP_SPARC_CRYPTO;
+		}

 		plen = strlen(prop) + 1;
 		prop += plen;

--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -32,6 +32,9 @@ lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o
 lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
 lib-$(CONFIG_SPARC64) +=  NG2patch.o

+lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o
+lib-$(CONFIG_SPARC64) +=  NG4patch.o NG4copy_page.o
+
 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o


--- a/arch/sparc/lib/NG4copy_from_user.S
+++ b/arch/sparc/lib/NG4copy_from_user.S
+/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x)		\
+98:	x;			\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, __retl_one_asi;\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NG4copy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, ___copy_in_user;		\
+	 nop
+#endif
+
+#include "NG4memcpy.S"
--- a/arch/sparc/lib/NG4copy_page.S
+++ b/arch/sparc/lib/NG4copy_page.S
+/* NG4copy_page.S: Niagara-4 optimized copy page.
+ *
+ * Copyright (C) 2012 (davem@davemloft.net)
+ */
+
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	.text
+	.align		32
+
+	.register	%g2, #scratch
+	.register	%g3, #scratch
+
+	.globl		NG4copy_user_page
+NG4copy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
+	prefetch	[%o1 + 0x000], #n_reads_strong
+	prefetch	[%o1 + 0x040], #n_reads_strong
+	prefetch	[%o1 + 0x080], #n_reads_strong
+	prefetch	[%o1 + 0x0c0], #n_reads_strong
+	set		PAGE_SIZE, %g7
+	prefetch	[%o1 + 0x100], #n_reads_strong
+	prefetch	[%o1 + 0x140], #n_reads_strong
+	prefetch	[%o1 + 0x180], #n_reads_strong
+	prefetch	[%o1 + 0x1c0], #n_reads_strong
+1:
+	ldx		[%o1 + 0x00], %o2
+	subcc		%g7, 0x40, %g7
+	ldx		[%o1 + 0x08], %o3
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	ldx		[%o1 + 0x20], %g1
+	stxa		%o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	ldx		[%o1 + 0x28], %g2
+	stxa		%o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	ldx		[%o1 + 0x30], %g3
+	stxa		%o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	ldx		[%o1 + 0x38], %o2
+	add		%o1, 0x40, %o1
+	stxa		%o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	bne,pt		%icc, 1b
+	 prefetch	[%o1 + 0x200], #n_reads_strong
+	retl
+	 membar		#StoreLoad | #StoreStore
+	.size		NG4copy_user_page,.-NG4copy_user_page
--- a/arch/sparc/lib/NG4copy_to_user.S
+++ b/arch/sparc/lib/NG4copy_to_user.S
+/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x)		\
+98:	x;			\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, __retl_one_asi;\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
+#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
+#endif
+
+#define FUNC_NAME		NG4copy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] %asi
+#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, ___copy_in_user;		\
+	 nop
+#endif
+
+#include "NG4memcpy.S"
--- a/arch/sparc/lib/NG4memcpy.S
+++ b/arch/sparc/lib/NG4memcpy.S
+/* NG4memcpy.S: Niagara-4 optimized memcpy.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#define GLOBAL_SPARE	%g7
+#else
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define FPRS_FEF  0x04
+
+/* On T4 it is very expensive to access ASRs like %fprs and
+ * %asi, avoiding a read or a write can save ~50 cycles.
+ */
+#define FPU_ENTER			\
+	rd	%fprs, %o5;		\
+	andcc	%o5, FPRS_FEF, %g0;	\
+	be,a,pn	%icc, 999f;		\
+	 wr	%g0, FPRS_FEF, %fprs;	\
+	999:
+
+#ifdef MEMCPY_DEBUG
+#define VISEntryHalf FPU_ENTER; \
+		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#else
+#define VISEntryHalf FPU_ENTER
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#endif
+
+#define GLOBAL_SPARE	%g5
+#endif
+
+#ifndef STORE_ASI
+#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
+#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
+#else
+#define STORE_ASI	0x80		/* ASI_P */
+#endif
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x)	x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x)	x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest)	type [addr], dest
+#endif
+
+#ifndef STORE
+#ifndef MEMCPY_DEBUG
+#define STORE(type,src,addr)	type src, [addr]
+#else
+#define STORE(type,src,addr)	type##a src, [addr] %asi
+#endif
+#endif
+
+#ifndef STORE_INIT
+#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME	NG4memcpy
+#endif
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+
+	.text
+	.align		64
+
+	.globl	FUNC_NAME
+	.type	FUNC_NAME,#function
+FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
+#ifdef MEMCPY_DEBUG
+	wr		%g0, 0x80, %asi
+#endif
+	srlx		%o2, 31, %g2
+	cmp		%g2, 0
+	tne		%XCC, 5
+	PREAMBLE
+	mov		%o0, %o3
+	brz,pn		%o2, .Lexit
+	 cmp		%o2, 3
+	ble,pn		%icc, .Ltiny
+	 cmp		%o2, 19
+	ble,pn		%icc, .Lsmall
+	 or		%o0, %o1, %g2
+	cmp		%o2, 128
+	bl,pn		%icc, .Lmedium
+	 nop
+
+.Llarge:/* len >= 0x80 */
+	/* First get dest 8 byte aligned.  */
+	sub		%g0, %o0, %g1
+	and		%g1, 0x7, %g1
+	brz,pt		%g1, 51f
+	 sub		%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+	add		%o1, 1, %o1
+	subcc		%g1, 1, %g1
+	add		%o0, 1, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
+
+51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+	/* Check if we can use the straight fully aligned
+	 * loop, or we require the alignaddr/faligndata variant.
+	 */
+	andcc		%o1, 0x7, %o5
+	bne,pn		%icc, .Llarge_src_unaligned
+	 sub		%g0, %o0, %g1
+
+	/* Legitimize the use of initializing stores by getting dest
+	 * to be 64-byte aligned.
+	 */
+	and		%g1, 0x3f, %g1
+	brz,pt		%g1, .Llarge_aligned
+	 sub		%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
+	add		%o1, 8, %o1
+	subcc		%g1, 8, %g1
+	add		%o0, 8, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stx, %g2, %o0 - 0x08))
+
+.Llarge_aligned:
+	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
+	andn		%o2, 0x3f, %o4
+	sub		%o2, %o4, %o2
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	add		%o1, 0x40, %o1
+	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
+	subcc		%o4, 0x40, %o4
+	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
+	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
+	EX_ST(STORE_INIT(%g1, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g2, %o0))
+	add		%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
+	EX_ST(STORE_INIT(%g3, %o0))
+	add		%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	add		%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
+	EX_ST(STORE_INIT(%o5, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g2, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g3, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	add		%o0, 0x08, %o0
+	bne,pt		%icc, 1b
+	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+	membar		#StoreLoad | #StoreStore
+
+	brz,pn		%o2, .Lexit
+	 cmp		%o2, 19
+	ble,pn		%icc, .Lsmall_unaligned
+	 nop
+	ba,a,pt		%icc, .Lmedium_noprefetch
+
+.Lexit:	retl
+	 mov		EX_RETVAL(%o3), %o0
+
+.Llarge_src_unaligned:
+	andn		%o2, 0x3f, %o4
+	sub		%o2, %o4, %o2
+	VISEntryHalf
+	alignaddr	%o1, %g0, %g1
+	add		%o1, %o4, %o1
+	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
+1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
+	subcc		%o4, 0x40, %o4
+	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
+	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
+	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
+	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
+	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
+	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
+	faligndata	%f0, %f2, %f16
+	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
+	faligndata	%f2, %f4, %f18
+	add		%g1, 0x40, %g1
+	faligndata	%f4, %f6, %f20
+	faligndata	%f6, %f8, %f22
+	faligndata	%f8, %f10, %f24
+	faligndata	%f10, %f12, %f26
+	faligndata	%f12, %f14, %f28
+	faligndata	%f14, %f0, %f30
+	EX_ST(STORE(std, %f16, %o0 + 0x00))
+	EX_ST(STORE(std, %f18, %o0 + 0x08))
+	EX_ST(STORE(std, %f20, %o0 + 0x10))
+	EX_ST(STORE(std, %f22, %o0 + 0x18))
+	EX_ST(STORE(std, %f24, %o0 + 0x20))
+	EX_ST(STORE(std, %f26, %o0 + 0x28))
+	EX_ST(STORE(std, %f28, %o0 + 0x30))
+	EX_ST(STORE(std, %f30, %o0 + 0x38))
+	add		%o0, 0x40, %o0
+	bne,pt		%icc, 1b
+	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
+	VISExitHalf
+
+	brz,pn		%o2, .Lexit
+	 cmp		%o2, 19
+	ble,pn		%icc, .Lsmall_unaligned
+	 nop
+	ba,a,pt		%icc, .Lmedium_unaligned
+
+.Lmedium:
+	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
+	andcc		%g2, 0x7, %g0
+	bne,pn		%icc, .Lmedium_unaligned
+	 nop
+.Lmedium_noprefetch:
+	andncc		%o2, 0x20 - 1, %o5
+	be,pn		%icc, 2f
+	 sub		%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
+	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
+	add		%o1, 0x20, %o1
+	subcc		%o5, 0x20, %o5
+	EX_ST(STORE(stx, %g1, %o0 + 0x00))
+	EX_ST(STORE(stx, %g2, %o0 + 0x08))
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
+	EX_ST(STORE(stx, %o4, %o0 + 0x18))
+	bne,pt		%icc, 1b
+	 add		%o0, 0x20, %o0
+2:	andcc		%o2, 0x18, %o5
+	be,pt		%icc, 3f
+	 sub		%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	add		%o1, 0x08, %o1
+	add		%o0, 0x08, %o0
+	subcc		%o5, 0x08, %o5
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
+3:	brz,pt		%o2, .Lexit
+	 cmp		%o2, 0x04
+	bl,pn		%icc, .Ltiny
+	 nop
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	add		%o1, 0x04, %o1
+	add		%o0, 0x04, %o0
+	subcc		%o2, 0x04, %o2
+	bne,pn		%icc, .Ltiny
+	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	ba,a,pt		%icc, .Lexit
+.Lmedium_unaligned:
+	/* First get dest 8 byte aligned.  */
+	sub		%g0, %o0, %g1
+	and		%g1, 0x7, %g1
+	brz,pt		%g1, 2f
+	 sub		%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+	add		%o1, 1, %o1
+	subcc		%g1, 1, %g1
+	add		%o0, 1, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
+2:
+	and		%o1, 0x7, %g1
+	brz,pn		%g1, .Lmedium_noprefetch
+	 sll		%g1, 3, %g1
+	mov		64, %g2
+	sub		%g2, %g1, %g2
+	andn		%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
+	sllx		%o4, %g1, %o4
+	andn		%o2, 0x08 - 1, %o5
+	sub		%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
+	add		%o1, 0x08, %o1
+	subcc		%o5, 0x08, %o5
+	srlx		%g3, %g2, GLOBAL_SPARE
+	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
+	add		%o0, 0x08, %o0
+	bne,pt		%icc, 1b
+	 sllx		%g3, %g1, %o4
+	srl		%g1, 3, %g1
+	add		%o1, %g1, %o1
+	brz,pn		%o2, .Lexit
+	 nop
+	ba,pt		%icc, .Lsmall_unaligned
+
+.Ltiny:
+	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	subcc		%o2, 1, %o2
+	be,pn		%icc, .Lexit
+	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
+	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
+	subcc		%o2, 1, %o2
+	be,pn		%icc, .Lexit
+	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
+	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
+	ba,pt		%icc, .Lexit
+	 EX_ST(STORE(stb, %g1, %o0 + 0x02))
+
+.Lsmall:
+	andcc		%g2, 0x3, %g0
+	bne,pn		%icc, .Lsmall_unaligned
+	 andn		%o2, 0x4 - 1, %o5
+	sub		%o2, %o5, %o2
+1:
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	add		%o1, 0x04, %o1
+	subcc		%o5, 0x04, %o5
+	add		%o0, 0x04, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	brz,pt		%o2, .Lexit
+	 nop
+	ba,a,pt		%icc, .Ltiny
+
+.Lsmall_unaligned:
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	add		%o1, 1, %o1
+	add		%o0, 1, %o0
+	subcc		%o2, 1, %o2
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
+	ba,a,pt		%icc, .Lexit
+	.size		FUNC_NAME, .-FUNC_NAME
--- a/arch/sparc/lib/NG4patch.S
+++ b/arch/sparc/lib/NG4patch.S
+/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara4_patch_copyops
+	.type	niagara4_patch_copyops,#function
+niagara4_patch_copyops:
+	NG_DO_PATCH(memcpy, NG4memcpy)
+	NG_DO_PATCH(___copy_from_user, NG4copy_from_user)
+	NG_DO_PATCH(___copy_to_user, NG4copy_to_user)
+	retl
+	 nop
+	.size	niagara4_patch_copyops,.-niagara4_patch_copyops
+
+	.globl	niagara4_patch_pageops
+	.type	niagara4_patch_pageops,#function
+niagara4_patch_pageops:
+	NG_DO_PATCH(copy_user_page, NG4copy_user_page)
+	NG_DO_PATCH(_clear_page, NGclear_page)
+	NG_DO_PATCH(clear_user_page, NGclear_user_page)
+	retl
+	 nop
+	.size	niagara4_patch_pageops,.-niagara4_patch_pageops
--- a/arch/sparc/lib/NGpage.S
+++ b/arch/sparc/lib/NGpage.S
@@ -59,6 +59,8 @@ NGcopy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
 	 restore

 	.align		32
+	.globl		NGclear_page
+	.globl		NGclear_user_page
 NGclear_page:		/* %o0=dest */
 NGclear_user_page:	/* %o0=dest, %o1=vaddr */
 	rd		%asi, %g3

--- a/arch/sparc/lib/ksyms.c
+++ b/arch/sparc/lib/ksyms.c
@@ -134,6 +134,10 @@ EXPORT_SYMBOL(copy_user_page);
 void VISenter(void);
 EXPORT_SYMBOL(VISenter);

+/* CRYPTO code needs this */
+void VISenterhalf(void);
+EXPORT_SYMBOL(VISenterhalf);
+
 extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
 extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
 		unsigned long *);

--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -51,22 +51,40 @@

 #include "init_64.h"

-unsigned long kern_linear_pte_xor[2] __read_mostly;
+unsigned long kern_linear_pte_xor[4] __read_mostly;

-/* A bitmap, one bit for every 256MB of physical memory.  If the bit
- * is clear, we should use a 4MB page (via kern_linear_pte_xor[0]) else
- * if set we should use a 256MB page (via kern_linear_pte_xor[1]).
+/* A bitmap, two bits for every 256MB of physical memory.  These two
+ * bits determine what page size we use for kernel linear
+ * translations.  They form an index into kern_linear_pte_xor[].  The
+ * value in the indexed slot is XOR'd with the TLB miss virtual
+ * address to form the resulting TTE.  The mapping is:
+ *
+ *	0	==>	4MB
+ *	1	==>	256MB
+ *	2	==>	2GB
+ *	3	==>	16GB
+ *
+ * All sun4v chips support 256MB pages.  Only SPARC-T4 and later
+ * support 2GB pages, and hopefully future cpus will support the 16GB
+ * pages as well.  For slots 2 and 3, we encode a 256MB TTE xor there
+ * if these larger page sizes are not supported by the cpu.
+ *
+ * It would be nice to determine this from the machine description
+ * 'cpu' properties, but we need to have this table setup before the
+ * MDESC is initialized.
 */
 unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];

 #ifndef CONFIG_DEBUG_PAGEALLOC
-/* A special kernel TSB for 4MB and 256MB linear mappings.
- * Space is allocated for this right after the trap table
- * in arch/sparc64/kernel/head.S
+/* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
+ * Space is allocated for this right after the trap table in
+ * arch/sparc64/kernel/head.S
 */
 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 #endif

+static unsigned long cpu_pgsz_mask;
+
 #define MAX_BANKS	32

 static struct linux_prom64_registers pavail[MAX_BANKS] __devinitdata;
@@ -403,6 +421,12 @@ EXPORT_SYMBOL(flush_icache_range);

 void mmu_info(struct seq_file *m)
 {
+	static const char *pgsz_strings[] = {
+		"8K", "64K", "512K", "4MB", "32MB",
+		"256MB", "2GB", "16GB",
+	};
+	int i, printed;
+
 	if (tlb_type == cheetah)
 		seq_printf(m, "MMU Type\t: Cheetah\n");
 	else if (tlb_type == cheetah_plus)
@@ -414,6 +438,17 @@ void mmu_info(struct seq_file *m)
 	else
 		seq_printf(m, "MMU Type\t: ???\n");

+	seq_printf(m, "MMU PGSZs\t: ");
+	printed = 0;
+	for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) {
+		if (cpu_pgsz_mask & (1UL << i)) {
+			seq_printf(m, "%s%s",
+				   printed ? "," : "", pgsz_strings[i]);
+			printed++;
+		}
+	}
+	seq_putc(m, '\n');
+
 #ifdef CONFIG_DEBUG_DCFLUSH
 	seq_printf(m, "DCPageFlushes\t: %d\n",
 		   atomic_read(&dcpage_flushes));
@@ -1358,32 +1393,75 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 extern unsigned int kvmap_linear_patch[1];
 #endif /* CONFIG_DEBUG_PAGEALLOC */

-static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
+static void __init kpte_set_val(unsigned long index, unsigned long val)
 {
-	const unsigned long shift_256MB = 28;
-	const unsigned long mask_256MB = ((1UL << shift_256MB) - 1UL);
-	const unsigned long size_256MB = (1UL << shift_256MB);
+	unsigned long *ptr = kpte_linear_bitmap;

-	while (start < end) {
-		long remains;
+	val <<= ((index % (BITS_PER_LONG / 2)) * 2);
+	ptr += (index / (BITS_PER_LONG / 2));

-		remains = end - start;
-		if (remains < size_256MB)
-			break;
+	*ptr |= val;
+}

-		if (start & mask_256MB) {
-			start = (start + size_256MB) & ~mask_256MB;
-			continue;
-		}
+static const unsigned long kpte_shift_min = 28; /* 256MB */
+static const unsigned long kpte_shift_max = 34; /* 16GB */
+static const unsigned long kpte_shift_incr = 3;
+
+static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end,
+					   unsigned long shift)
+{
+	unsigned long size = (1UL << shift);
+	unsigned long mask = (size - 1UL);
+	unsigned long remains = end - start;
+	unsigned long val;
+
+	if (remains < size || (start & mask))
+		return start;
+
+	/* VAL maps:
+	 *
+	 *	shift 28 --> kern_linear_pte_xor index 1
+	 *	shift 31 --> kern_linear_pte_xor index 2
+	 *	shift 34 --> kern_linear_pte_xor index 3
+	 */
+	val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1;
+
+	remains &= ~mask;
+	if (shift != kpte_shift_max)
+		remains = size;

-		while (remains >= size_256MB) {
-			unsigned long index = start >> shift_256MB;
+	while (remains) {
+		unsigned long index = start >> kpte_shift_min;

-			__set_bit(index, kpte_linear_bitmap);
+		kpte_set_val(index, val);

-			start += size_256MB;
-			remains -= size_256MB;
+		start += 1UL << kpte_shift_min;
+		remains -= 1UL << kpte_shift_min;
+	}
+
+	return start;
+}
+
+static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
+{
+	unsigned long smallest_size, smallest_mask;
+	unsigned long s;
+
+	smallest_size = (1UL << kpte_shift_min);
+	smallest_mask = (smallest_size - 1UL);
+
+	while (start < end) {
+		unsigned long orig_start = start;
+
+		for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) {
+			start = kpte_mark_using_shift(start, end, s);
+
+			if (start != orig_start)
+				break;
 		}
+
+		if (start == orig_start)
+			start = (start + smallest_size) & ~smallest_mask;
 	}
 }

@@ -1577,13 +1655,16 @@ static void __init sun4v_ktsb_init(void)
 	ktsb_descr[0].resv = 0;

 #ifndef CONFIG_DEBUG_PAGEALLOC
-	/* Second KTSB for 4MB/256MB mappings.  */
+	/* Second KTSB for 4MB/256MB/2GB/16GB mappings.  */
 	ktsb_pa = (kern_base +
 		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));

 	ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB;
-	ktsb_descr[1].pgsz_mask = (HV_PGSZ_MASK_4MB |
-				   HV_PGSZ_MASK_256MB);
+	ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB |
+				    HV_PGSZ_MASK_256MB |
+				    HV_PGSZ_MASK_2GB |
+				    HV_PGSZ_MASK_16GB) &
+				   cpu_pgsz_mask);
 	ktsb_descr[1].assoc = 1;
 	ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES;
 	ktsb_descr[1].ctx_idx = 0;
@@ -1606,6 +1687,47 @@ void __cpuinit sun4v_ktsb_register(void)
 	}
 }

+static void __init sun4u_linear_pte_xor_finalize(void)
+{
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	/* This is where we would add Panther support for
+	 * 32MB and 256MB pages.
+	 */
+#endif
+}
+
+static void __init sun4v_linear_pte_xor_finalize(void)
+{
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
+		kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
+			0xfffff80000000000UL;
+		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+					   _PAGE_P_4V | _PAGE_W_4V);
+	} else {
+		kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
+	}
+
+	if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
+		kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
+			0xfffff80000000000UL;
+		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+					   _PAGE_P_4V | _PAGE_W_4V);
+	} else {
+		kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
+	}
+
+	if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
+		kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
+			0xfffff80000000000UL;
+		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+					   _PAGE_P_4V | _PAGE_W_4V);
+	} else {
+		kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
+	}
+#endif
+}
+
 /* paging_init() sets up the page tables */

 static unsigned long last_valid_pfn;
@@ -1665,10 +1787,8 @@ void __init paging_init(void)
 		ktsb_phys_patch();
 	}

-	if (tlb_type == hypervisor) {
+	if (tlb_type == hypervisor)
 		sun4v_patch_tlb_handlers();
-		sun4v_ktsb_init();
-	}

 	/* Find available physical memory...
 	 *
@@ -1727,9 +1847,6 @@ void __init paging_init(void)

 	__flush_tlb_all();

-	if (tlb_type == hypervisor)
-		sun4v_ktsb_register();
-
 	prom_build_devicetree();
 	of_populate_present_mask();
 #ifndef CONFIG_SMP
@@ -1742,8 +1859,36 @@ void __init paging_init(void)
 #ifndef CONFIG_SMP
 		mdesc_fill_in_cpu_data(cpu_all_mask);
 #endif
+		mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask);
+
+		sun4v_linear_pte_xor_finalize();
+
+		sun4v_ktsb_init();
+		sun4v_ktsb_register();
+	} else {
+		unsigned long impl, ver;
+
+		cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
+				 HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);
+
+		__asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
+		impl = ((ver >> 32) & 0xffff);
+		if (impl == PANTHER_IMPL)
+			cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB |
+					  HV_PGSZ_MASK_256MB);
+
+		sun4u_linear_pte_xor_finalize();
 	}

+	/* Flush the TLBs and the 4M TSB so that the updated linear
+	 * pte XOR settings are realized for all mappings.
+	 */
+	__flush_tlb_all();
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
+#endif
+	__flush_tlb_all();
+
 	/* Setup bootmem... */
 	last_valid_pfn = end_pfn = bootmem_init(phys_base);

@@ -2110,6 +2255,7 @@ static void __init sun4u_pgprot_init(void)
 {
 	unsigned long page_none, page_shared, page_copy, page_readonly;
 	unsigned long page_exec_bit;
+	int i;

 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
 				_PAGE_CACHE_4U | _PAGE_P_4U |
@@ -2137,8 +2283,8 @@ static void __init sun4u_pgprot_init(void)
 	kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
 				   _PAGE_P_4U | _PAGE_W_4U);

-	/* XXX Should use 256MB on Panther. XXX */
-	kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
+	for (i = 1; i < 4; i++)
+		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];

 	_PAGE_SZBITS = _PAGE_SZBITS_4U;
 	_PAGE_ALL_SZ_BITS =  (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
@@ -2164,6 +2310,7 @@ static void __init sun4v_pgprot_init(void)
 {
 	unsigned long page_none, page_shared, page_copy, page_readonly;
 	unsigned long page_exec_bit;
+	int i;

 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
 				_PAGE_CACHE_4V | _PAGE_P_4V |
@@ -2185,15 +2332,8 @@ static void __init sun4v_pgprot_init(void)
 	kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
 				   _PAGE_P_4V | _PAGE_W_4V);

-#ifdef CONFIG_DEBUG_PAGEALLOC
-	kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^
-		0xfffff80000000000UL;
-#else
-	kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
-		0xfffff80000000000UL;
-#endif
-	kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
-				   _PAGE_P_4V | _PAGE_W_4V);
+	for (i = 1; i < 4; i++)
+		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];

 	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
 		     __ACCESS_BITS_4V | _PAGE_E_4V);

--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -8,12 +8,12 @@
 #define MAX_PHYS_ADDRESS	(1UL << 41UL)
 #define KPTE_BITMAP_CHUNK_SZ		(256UL * 1024UL * 1024UL)
 #define KPTE_BITMAP_BYTES	\
-	((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 8)
+	((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4)
 #define VALID_ADDR_BITMAP_CHUNK_SZ	(4UL * 1024UL * 1024UL)
 #define VALID_ADDR_BITMAP_BYTES	\
 	((MAX_PHYS_ADDRESS / VALID_ADDR_BITMAP_CHUNK_SZ) / 8)

-extern unsigned long kern_linear_pte_xor[2];
+extern unsigned long kern_linear_pte_xor[4];
 extern unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
 extern unsigned int sparc64_highest_unlocked_tlb_ent;
 extern unsigned long sparc64_kern_pri_context;

--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -336,6 +336,15 @@ config CRYPTO_CRC32C_INTEL
 	  gain performance compared with software implementation.
 	  Module will be crc32c-intel.

+config CRYPTO_CRC32C_SPARC64
+	tristate "CRC32c CRC algorithm (SPARC64)"
+	depends on SPARC64
+	select CRYPTO_HASH
+	select CRC32
+	help
+	  CRC32c CRC algorithm implemented using sparc64 crypto instructions,
+	  when available.
+
 config CRYPTO_GHASH
 	tristate "GHASH digest algorithm"
 	select CRYPTO_GF128MUL
@@ -354,6 +363,15 @@ config CRYPTO_MD5
 	help
 	  MD5 message digest algorithm (RFC1321).

+config CRYPTO_MD5_SPARC64
+	tristate "MD5 digest algorithm (SPARC64)"
+	depends on SPARC64
+	select CRYPTO_MD5
+	select CRYPTO_HASH
+	help
+	  MD5 message digest algorithm (RFC1321) implemented
+	  using sparc64 crypto instructions, when available.
+
 config CRYPTO_MICHAEL_MIC
 	tristate "Michael MIC keyed digest algorithm"
 	select CRYPTO_HASH
@@ -433,6 +451,15 @@ config CRYPTO_SHA1_SSSE3
 	  using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
 	  Extensions (AVX), when available.

+config CRYPTO_SHA1_SPARC64
+	tristate "SHA1 digest algorithm (SPARC64)"
+	depends on SPARC64
+	select CRYPTO_SHA1
+	select CRYPTO_HASH
+	help
+	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
+	  using sparc64 crypto instructions, when available.
+
 config CRYPTO_SHA256
 	tristate "SHA224 and SHA256 digest algorithm"
 	select CRYPTO_HASH
@@ -445,6 +472,15 @@ config CRYPTO_SHA256
 	  This code also includes SHA-224, a 224 bit hash with 112 bits
 	  of security against collision attacks.

+config CRYPTO_SHA256_SPARC64
+	tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
+	depends on SPARC64
+	select CRYPTO_SHA256
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using sparc64 crypto instructions, when available.
+
 config CRYPTO_SHA512
 	tristate "SHA384 and SHA512 digest algorithms"
 	select CRYPTO_HASH
@@ -457,6 +493,15 @@ config CRYPTO_SHA512
 	  This code also includes SHA-384, a 384 bit hash with 192 bits
 	  of security against collision attacks.

+config CRYPTO_SHA512_SPARC64
+	tristate "SHA384 and SHA512 digest algorithm (SPARC64)"
+	depends on SPARC64
+	select CRYPTO_SHA512
+	select CRYPTO_HASH
+	help
+	  SHA-512 secure hash standard (DFIPS 180-2) implemented
+	  using sparc64 crypto instructions, when available.
+
 config CRYPTO_TGR192
 	tristate "Tiger digest algorithms"
 	select CRYPTO_HASH
@@ -588,6 +633,34 @@ config CRYPTO_AES_NI_INTEL
 	  ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional
 	  acceleration for CTR.

+config CRYPTO_AES_SPARC64
+	tristate "AES cipher algorithms (SPARC64)"
+	depends on SPARC64
+	select CRYPTO_CRYPTD
+	select CRYPTO_ALGAPI
+	help
+	  Use SPARC64 crypto opcodes for AES algorithm.
+
+	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
+	  algorithm.
+
+	  Rijndael appears to be consistently a very good performer in
+	  both hardware and software across a wide range of computing
+	  environments regardless of its use in feedback or non-feedback
+	  modes. Its key setup time is excellent, and its key agility is
+	  good. Rijndael's very low memory requirements make it very well
+	  suited for restricted-space environments, in which it also
+	  demonstrates excellent performance. Rijndael's operations are
+	  among the easiest to defend against power and timing attacks.
+
+	  The AES specifies three key sizes: 128, 192 and 256 bits
+
+	  See <http://csrc.nist.gov/encryption/aes/> for more information.
+
+	  In addition to AES cipher algorithm support, the acceleration
+	  for some popular block cipher mode is supported too, including
+	  ECB and CBC.
+
 config CRYPTO_ANUBIS
 	tristate "Anubis cipher algorithm"
 	select CRYPTO_ALGAPI
@@ -685,6 +758,22 @@ config CRYPTO_CAMELLIA_X86_64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>

+config CRYPTO_CAMELLIA_SPARC64
+	tristate "Camellia cipher algorithm (SPARC64)"
+	depends on SPARC64
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	help
+	  Camellia cipher algorithm module (SPARC64).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  The Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAST5
 	tristate "CAST5 (CAST-128) cipher algorithm"
 	select CRYPTO_ALGAPI
@@ -705,6 +794,14 @@ config CRYPTO_DES
 	help
 	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).

+config CRYPTO_DES_SPARC64
+	tristate "DES and Triple DES EDE cipher algorithms (SPARC64)"
+	select CRYPTO_ALGAPI
+	select CRYPTO_DES
+	help
+	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3),
+	  optimized using SPARC64 crypto opcodes.
+
 config CRYPTO_FCRYPT
 	tristate "FCrypt cipher algorithm"
 	select CRYPTO_ALGAPI

--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -42,7 +42,7 @@ MODULE_DESCRIPTION("Niagara2 Crypto driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);

-#define N2_CRA_PRIORITY		300
+#define N2_CRA_PRIORITY		200

 static DEFINE_MUTEX(spu_lock);


--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -150,7 +150,7 @@ static long d7s_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			regs |= D7S_FLIP;
 		writeb(regs, p->regs);
 		break;
-	};
+	}
 	mutex_unlock(&d7s_mutex);

 	return error;

--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -353,7 +353,7 @@ static int envctrl_i2c_data_translate(unsigned char data, int translate_type,

 	default:
 		break;
-	};
+	}

 	return len;
 }
@@ -644,7 +644,7 @@ envctrl_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	default:
 		break;

-	};
+	}

 	return ret;
 }
@@ -687,7 +687,7 @@ envctrl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)

 	default:
 		return -EINVAL;
-	};
+	}

 	return 0;
 }
@@ -947,7 +947,7 @@ static void envctrl_init_i2c_child(struct device_node *dp,

 		default:
 			break;
-		};
+		}
 	}
 }


--- a/drivers/sbus/char/openprom.c
+++ b/drivers/sbus/char/openprom.c
@@ -222,7 +222,7 @@ static int opromnext(void __user *argp, unsigned int cmd, struct device_node *dp
 		case OPROMSETCUR:
 		default:
 			break;
-		};
+		}
 	} else {
 		/* Sibling of node zero is the root node.  */
 		if (cmd != OPROMNEXT)
@@ -588,7 +588,7 @@ static int openprom_bsd_ioctl(struct file * file,
 	default:
 		err = -EINVAL;
 		break;
-	};
+	}
 	mutex_unlock(&openprom_mutex);

 	return err;