Merge tag 'arc-4.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc

Pull ARC architecture updates from Vineet Gupta: "ARC updates for 4.3: - perf support for ARCv2 based cores (sampling interrupt, SMP) - leftovers for ARCv2 support - futex fixes" * tag 'arc-4.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc: ARCv2: entry: Fix reserved handler ARCv2: perf: Finally introduce HS perf unit ARCv2: perf: SMP support ARCv2: perf: implement exclusion of event counting in user or kernel mode ARCv2: perf: Support sampling events using overflow interrupts ARCv2: perf: implement "event_set_period" ARC: perf: cap the number of counters to hardware max of 32 ARC: Eliminate some ARCv2 specific code for ARCompact build ARC: add/fix some comments in code - no functional change ARC: change some branchs to jumps to resolve linkage errors ARC: ensure futex ops are atomic in !LLSC config ARC: Enable HAVE_FUTEX_CMPXCHG ARC: make futex_atomic_cmpxchg_inatomic() return bimodal ARC: futex cosmetics ARC: add barriers to futex code ARCv2: IOC: Allow boot time disable ARCv2: SLC: Allow boot time disable ARCv2: Support IO Coherency and permutations involving L1 and L2 caches ARC: Enable optimistic spinning for LLSC config MAINTAINERS: add git tree for the arc architecture

Merge tag 'arc-4.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc
Pull ARC architecture updates from Vineet Gupta: "ARC updates for 4.3: - perf support for ARCv2 based cores (sampling interrupt, SMP) - leftovers for ARCv2 support - futex fixes" * tag 'arc-4.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc: ARCv2: entry: Fix reserved handler ARCv2: perf: Finally introduce HS perf unit ARCv2: perf: SMP support ARCv2: perf: implement exclusion of event counting in user or kernel mode ARCv2: perf: Support sampling events using overflow interrupts ARCv2: perf: implement "event_set_period" ARC: perf: cap the number of counters to hardware max of 32 ARC: Eliminate some ARCv2 specific code for ARCompact build ARC: add/fix some comments in code - no functional change ARC: change some branchs to jumps to resolve linkage errors ARC: ensure futex ops are atomic in !LLSC config ARC: Enable HAVE_FUTEX_CMPXCHG ARC: make futex_atomic_cmpxchg_inatomic() return bimodal ARC: futex cosmetics ARC: add barriers to futex code ARCv2: IOC: Allow boot time disable ARCv2: SLC: Allow boot time disable ARCv2: Support IO Coherency and permutations involving L1 and L2 caches ARC: Enable optimistic spinning for LLSC config MAINTAINERS: add git tree for the arc architecture
28dce7c7 · Linus Torvalds · 361f7d17 · 3d592659 · 28dce7c7 · 28dce7c7
Commit 28dce7c7 authored Sep 01, 2015 by Linus Torvalds
17 changed files
--- a/Documentation/devicetree/bindings/arc/archs-pct.txt
+++ b/Documentation/devicetree/bindings/arc/archs-pct.txt
+* ARC HS Performance Counters
+The ARC HS can be configured with a pipeline performance monitor for counting
+CPU and cache events like cache misses and hits. Like conventional PCT there
+are 100+ hardware conditions dynamically mapped to upto 32 counters.
+It also supports overflow interrupts.
+Required properties:
+- compatible : should contain
+	"snps,archs-pct"
+Example:
+pmu {
+        compatible = "snps,archs-pct";
+};
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9911,8 +9911,9 @@ SYNOPSYS ARC ARCHITECTURE
 M:	Vineet Gupta <vgupta@synopsys.com>
 S:	Supported
 F:	arch/arc/
-F:	Documentation/devicetree/bindings/arc/
+F:	Documentation/devicetree/bindings/arc/*
 F:	drivers/tty/serial/arc_uart.c
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git
 SYNOPSYS ARC SDP platform support
 M:	Alexey Brodkin <abrodkin@synopsys.com>

--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -8,6 +8,7 @@
 config ARC
 	def_bool y
+	select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
 	select BUILDTIME_EXTABLE_SORT
 	select COMMON_CLK
 	select CLONE_BACKWARDS
@@ -22,6 +23,7 @@ config ARC
 	select GENERIC_SMP_IDLE_THREAD
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_FUTEX_CMPXCHG
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES

--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -72,12 +72,13 @@ arcpct0: pct {
 	};
 	/*
-	 * This INTC is actually connected to DW APB GPIO
+	 * The DW APB ICTL intc on MB is connected to CPU intc via a
-	 * which acts as a wire between MB INTC and CPU INTC.
+	 * DT "invisible" DW APB GPIO block, configured to simply pass thru
-	 * GPIO INTC is configured in platform init code
+	 * interrupts - setup accordinly in platform init (plat-axs10x/ax10x.c)
-	 * and here we mimic direct connection from MB INTC to
+	 *
-	 * CPU INTC, thus we set "interrupts = <7>" instead of
+	 * So here we mimic a direct connection betwen them, ignoring the
-	 * "interrupts = <12>"
+	 * ABPG GPIO. Thus set "interrupts = <24>" (DW APB GPIO to core)
+	 * instead of "interrupts = <12>" (DW APB ICTL to DW APB GPIO)
 	 *
 	 * This intc actually resides on MB, but we move it here to
 	 * avoid duplicating the MB dtsi file given that IRQ from

--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -35,6 +35,7 @@
 #define ARC_REG_RTT_BCR		0xF2
 #define ARC_REG_IRQ_BCR		0xF3
 #define ARC_REG_SMART_BCR	0xFF
+#define ARC_REG_CLUSTER_BCR	0xcf
 /* status32 Bits Positions */
 #define STATUS_AE_BIT		5	/* Exception active */

--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -53,6 +53,8 @@ extern void arc_cache_init(void);
 extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
 extern void read_decode_cache_bcr(void);
+extern int ioc_exists;
 #endif	/* !__ASSEMBLY__ */
 /* Instruction cache related Auxiliary registers */
@@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void);
 #define SLC_CTRL_BUSY		0x100
 #define SLC_CTRL_RGN_OP_INV	0x200
+/* IO coherency related Auxiliary registers */
+#define ARC_REG_IO_COH_ENABLE	0x500
+#define ARC_REG_IO_COH_PARTIAL	0x501
+#define ARC_REG_IO_COH_AP0_BASE	0x508
+#define ARC_REG_IO_COH_AP0_SIZE	0x509
 #endif /* _ASM_CACHE_H */
--- a/arch/arc/include/asm/cmpxchg.h
+++ b/arch/arc/include/asm/cmpxchg.h
@@ -110,18 +110,18 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
 						 sizeof(*(ptr))))
 /*
- * On ARC700, EX insn is inherently atomic, so by default "vanilla" xchg() need
+ * xchg() maps directly to ARC EX instruction which guarantees atomicity.
- * not require any locking. However there's a quirk.
+ * However in !LLSC config, it also needs to be use @atomic_ops_lock spinlock
- * ARC lacks native CMPXCHG, thus emulated (see above), using external locking -
+ * due to a subtle reason:
- * incidently it "reuses" the same atomic_ops_lock used by atomic APIs.
+ *  - For !LLSC, cmpxchg() needs to use that lock (see above) and there is lot
- * Now, llist code uses cmpxchg() and xchg() on same data, so xchg() needs to
+ *    of  kernel code which calls xchg()/cmpxchg() on same data (see llist.h)
- * abide by same serializing rules, thus ends up using atomic_ops_lock as well.
+ *    Hence xchg() needs to follow same locking rules.
 *
- * This however is only relevant if SMP and/or ARC lacks LLSC
+ * Technically the lock is also needed for UP (boils down to irq save/restore)
- *   if (UP or LLSC)
+ * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to
- *      xchg doesn't need serialization
+ * be disabled thus can't possibly be interrpted/preempted/clobbered by xchg()
- *   else <==> !(UP or LLSC) <==> (!UP and !LLSC) <==> (SMP and !LLSC)
+ * Other way around, xchg is one instruction anyways, so can't be interrupted
- *      xchg needs serialization
+ * as such
 */
 #if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP)

--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -20,6 +20,7 @@
 #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\
 							\
+	smp_mb();					\
 	__asm__ __volatile__(				\
 	"1:	llock	%1, [%2]		\n"	\
 		insn				"\n"	\
@@ -30,7 +31,7 @@
 	"	.section .fixup,\"ax\"		\n"	\
 	"	.align  4			\n"	\
 	"4:	mov %0, %4			\n"	\
-	"	b   3b				\n"	\
+	"	j   3b				\n"	\
 	"	.previous			\n"	\
 	"	.section __ex_table,\"a\"	\n"	\
 	"	.align  4			\n"	\
@@ -40,12 +41,14 @@
 							\
 	: "=&r" (ret), "=&r" (oldval)			\
 	: "r" (uaddr), "r" (oparg), "ir" (-EFAULT)	\
-	: "cc", "memory")
+	: "cc", "memory");				\
+	smp_mb()					\
 #else	/* !CONFIG_ARC_HAS_LLSC */
 #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\
 							\
+	smp_mb();					\
 	__asm__ __volatile__(				\
 	"1:	ld	%1, [%2]		\n"	\
 		insn				"\n"	\
@@ -55,7 +58,7 @@
 	"	.section .fixup,\"ax\"		\n"	\
 	"	.align  4			\n"	\
 	"4:	mov %0, %4			\n"	\
-	"	b   3b				\n"	\
+	"	j   3b				\n"	\
 	"	.previous			\n"	\
 	"	.section __ex_table,\"a\"	\n"	\
 	"	.align  4			\n"	\
@@ -65,7 +68,8 @@
 							\
 	: "=&r" (ret), "=&r" (oldval)			\
 	: "r" (uaddr), "r" (oparg), "ir" (-EFAULT)	\
-	: "cc", "memory")
+	: "cc", "memory");				\
+	smp_mb()					\
 #endif
@@ -83,6 +87,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
+#ifndef CONFIG_ARC_HAS_LLSC
+	preempt_disable();	/* to guarantee atomic r-m-w of futex op */
+#endif
 	pagefault_disable();
 	switch (op) {
@@ -90,6 +97,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 		__futex_atomic_op("mov %0, %3", ret, oldval, uaddr, oparg);
 		break;
 	case FUTEX_OP_ADD:
+		/* oldval = *uaddr; *uaddr += oparg ; ret = *uaddr */
 		__futex_atomic_op("add %0, %1, %3", ret, oldval, uaddr, oparg);
 		break;
 	case FUTEX_OP_OR:
@@ -106,6 +114,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	}
 	pagefault_enable();
+#ifndef CONFIG_ARC_HAS_LLSC
+	preempt_enable();
+#endif
 	if (!ret) {
 		switch (cmp) {
@@ -134,54 +145,57 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	return ret;
 }
-/* Compare-xchg with pagefaults disabled.
+/*
- *  Notes:
+ * cmpxchg of futex (pagefaults disabled by caller)
- *      -Best-Effort: Exchg happens only if compare succeeds.
+ * Return 0 for success, -EFAULT otherwise
- *          If compare fails, returns; leaving retry/looping to upper layers
- *      -successful cmp-xchg: return orig value in @addr (same as cmp val)
- *      -Compare fails: return orig value in @addr
- *      -user access r/w fails: return -EFAULT
 */
 static inline int
-futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval,
 			      u32 newval)
 {
-	u32 val;
+	int ret = 0;
+	u32 existval;
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
-	pagefault_disable();
+#ifndef CONFIG_ARC_HAS_LLSC
+	preempt_disable();	/* to guarantee atomic r-m-w of futex op */
+#endif
+	smp_mb();
 	__asm__ __volatile__(
 #ifdef CONFIG_ARC_HAS_LLSC
-	"1:	llock	%0, [%3]		\n"
+	"1:	llock	%1, [%4]		\n"
-	"	brne	%0, %1, 3f		\n"
+	"	brne	%1, %2, 3f		\n"
-	"2:	scond	%2, [%3]		\n"
+	"2:	scond	%3, [%4]		\n"
 	"	bnz	1b			\n"
 #else
-	"1:	ld	%0, [%3]		\n"
+	"1:	ld	%1, [%4]		\n"
-	"	brne	%0, %1, 3f		\n"
+	"	brne	%1, %2, 3f		\n"
-	"2:	st	%2, [%3]		\n"
+	"2:	st	%3, [%4]		\n"
 #endif
 	"3:	\n"
 	"	.section .fixup,\"ax\"	\n"
-	"4:	mov %0, %4	\n"
+	"4:	mov %0, %5	\n"
-	"	b   3b	\n"
+	"	j   3b	\n"
 	"	.previous	\n"
 	"	.section __ex_table,\"a\"	\n"
 	"	.align  4	\n"
 	"	.word   1b, 4b	\n"
 	"	.word   2b, 4b	\n"
 	"	.previous\n"
-	: "=&r"(val)
+	: "+&r"(ret), "=&r"(existval)
-	: "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
+	: "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
 	: "cc", "memory");
-	pagefault_enable();
+	smp_mb();
-	*uval = val;
+#ifndef CONFIG_ARC_HAS_LLSC
-	return val;
+	preempt_enable();
+#endif
+	*uval = existval;
+	return ret;
 }
 #endif
--- a/arch/arc/include/asm/perf_event.h
+++ b/arch/arc/include/asm/perf_event.h
 /*
 * Linux performance counter support for ARC
 *
+ * Copyright (C) 2014-2015 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2011-2013 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
@@ -12,8 +13,8 @@
 #ifndef __ASM_PERF_EVENT_H
 #define __ASM_PERF_EVENT_H
-/* real maximum varies per CPU, this is the maximum supported by the driver */
+/* Max number of counters that PCT block may ever have */
-#define ARC_PMU_MAX_HWEVENTS	64
+#define ARC_PERF_MAX_COUNTERS	32
 #define ARC_REG_CC_BUILD	0xF6
 #define ARC_REG_CC_INDEX	0x240
@@ -28,15 +29,22 @@
 #define ARC_REG_PCT_CONFIG	0x254
 #define ARC_REG_PCT_CONTROL	0x255
 #define ARC_REG_PCT_INDEX	0x256
+#define ARC_REG_PCT_INT_CNTL	0x25C
+#define ARC_REG_PCT_INT_CNTH	0x25D
+#define ARC_REG_PCT_INT_CTRL	0x25E
+#define ARC_REG_PCT_INT_ACT	0x25F
+#define ARC_REG_PCT_CONFIG_USER	(1 << 18)	/* count in user mode */
+#define ARC_REG_PCT_CONFIG_KERN	(1 << 19)	/* count in kernel mode */
 #define ARC_REG_PCT_CONTROL_CC	(1 << 16)	/* clear counts */
 #define ARC_REG_PCT_CONTROL_SN	(1 << 17)	/* snapshot */
 struct arc_reg_pct_build {
 #ifdef CONFIG_CPU_BIG_ENDIAN
-	unsigned int m:8, c:8, r:6, s:2, v:8;
+	unsigned int m:8, c:8, r:5, i:1, s:2, v:8;
 #else
-	unsigned int v:8, s:2, r:6, c:8, m:8;
+	unsigned int v:8, s:2, i:1, r:5, c:8, m:8;
 #endif
 };
@@ -95,10 +103,13 @@ static const char * const arc_pmu_ev_hw_map[] = {
 	/* counts condition */
 	[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp",
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */
 	[PERF_COUNT_ARC_BPOK]         = "bpok",	  /* NP-NT, PT-T, PNT-NT */
+#ifdef CONFIG_ISA_ARCV2
+	[PERF_COUNT_HW_BRANCH_MISSES] = "bpmp",
+#else
 	[PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */
+#endif
 	[PERF_COUNT_ARC_LDC] = "imemrdc",	/* Instr: mem read cached */
 	[PERF_COUNT_ARC_STC] = "imemwrc",	/* Instr: mem write cached */

--- a/arch/arc/kernel/entry-arcv2.S
+++ b/arch/arc/kernel/entry-arcv2.S
@@ -57,13 +57,8 @@ VECTOR	handle_interrupt	; (23) End of fixed IRQs
 	.section .text, "ax",@progbits
-res_service:		; processor restart
+reserved:
-	flag    0x1     ; not implemented
+	flag 1		; Unexpected event, halt
-	nop
-	nop
-reserved:		; processor restart
-	rtie            ; jump to processor initializations
 ;##################### Interrupt Handling ##############################

--- a/arch/arc/kernel/entry.S
+++ b/arch/arc/kernel/entry.S
@@ -42,7 +42,7 @@ ENTRY(ret_from_fork)
 	; when the forked child comes here from the __switch_to function
 	; r0 has the last task pointer.
 	; put last task in scheduler queue
-	bl   @schedule_tail
+	jl   @schedule_tail
 	ld   r9, [sp, PT_status32]
 	brne r9, 0, 1f
@@ -320,7 +320,7 @@ resume_user_mode_begin:
 	; --- (Slow Path #1) task preemption ---
 	bbit0  r9, TIF_NEED_RESCHED, .Lchk_pend_signals
 	mov    blink, resume_user_mode_begin  ; tail-call to U mode ret chks
-	b      @schedule 	; BTST+Bnz causes relo error in link
+	j      @schedule 	; BTST+Bnz causes relo error in link
 .Lchk_pend_signals:
 	IRQ_ENABLE	r10
@@ -381,7 +381,7 @@ resume_kernel_mode:
 	bbit0  r9, TIF_NEED_RESCHED, .Lrestore_regs
 	; Invoke PREEMPTION
-	bl      preempt_schedule_irq
+	jl      preempt_schedule_irq
 	; preempt_schedule_irq() always returns with IRQ disabled
 #endif

--- a/arch/arc/kernel/perf_event.c
+++ b/arch/arc/kernel/perf_event.c
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -65,7 +65,7 @@ asmlinkage void ret_from_fork(void);
 * ------------------
 * |     r25        |   <==== top of Stack (thread.ksp)
 * ~                ~
- * |    --to--      |   (CALLEE Regs of user mode)
+ * |    --to--      |   (CALLEE Regs of kernel mode)
 * |     r13        |
 * ------------------
 * |     fp         |

--- a/arch/arc/kernel/unaligned.c
+++ b/arch/arc/kernel/unaligned.c
@@ -34,7 +34,7 @@
 	"	.section .fixup,\"ax\"\n"		\
 	"	.align	4\n"				\
 	"3:	mov	%0, 1\n"			\
-	"	b	2b\n"				\
+	"	j	2b\n"				\
 	"	.previous\n"				\
 	"	.section __ex_table,\"a\"\n"		\
 	"	.align	4\n"				\
@@ -82,7 +82,7 @@
 		"	.section .fixup,\"ax\"\n"	\
 		"	.align	4\n"			\
 		"4:	mov	%0, 1\n"		\
-		"	b	3b\n"			\
+		"	j	3b\n"			\
 		"	.previous\n"			\
 		"	.section __ex_table,\"a\"\n"	\
 		"	.align	4\n"			\
@@ -113,7 +113,7 @@
 		"	.section .fixup,\"ax\"\n"	\
 		"	.align	4\n"			\
 		"6:	mov	%0, 1\n"		\
-		"	b	5b\n"			\
+		"	j	5b\n"			\
 		"	.previous\n"			\
 		"	.section __ex_table,\"a\"\n"	\
 		"	.align	4\n"			\

--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -22,15 +22,22 @@
 #include <asm/setup.h>
 static int l2_line_sz;
+int ioc_exists;
+volatile int slc_enable = 1, ioc_enable = 1;
 void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr,
 			       unsigned long sz, const int cacheop);
+void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz);
+void (*__dma_cache_inv)(unsigned long start, unsigned long sz);
+void (*__dma_cache_wback)(unsigned long start, unsigned long sz);
 char *arc_cache_mumbojumbo(int c, char *buf, int len)
 {
 	int n = 0;
 	struct cpuinfo_arc_cache *p;
+#define IS_USED_RUN(v)		((v) ? "" : "(disabled) ")
 #define PR_CACHE(p, cfg, str)						\
 	if (!(p)->ver)							\
 		n += scnprintf(buf + n, len - n, str"\t\t: N/A\n");	\
@@ -45,10 +52,18 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
 	PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache");
 	PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache");
+	if (!is_isa_arcv2())
+                return buf;
 	p = &cpuinfo_arc700[c].slc;
 	if (p->ver)
 		n += scnprintf(buf + n, len - n,
-			"SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len);
+			       "SLC\t\t: %uK, %uB Line%s\n",
+			       p->sz_k, p->line_len, IS_USED_RUN(slc_enable));
+	if (ioc_exists)
+		n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n",
+				IS_USED_RUN(ioc_enable));
 	return buf;
 }
@@ -58,18 +73,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
 * the cpuinfo structure for later use.
 * No Validation done here, simply read/convert the BCRs
 */
-void read_decode_cache_bcr(void)
+static void read_decode_cache_bcr_arcv2(int cpu)
 {
-	struct cpuinfo_arc_cache *p_ic, *p_dc, *p_slc;
+	struct cpuinfo_arc_cache *p_slc = &cpuinfo_arc700[cpu].slc;
-	unsigned int cpu = smp_processor_id();
-	struct bcr_cache {
-#ifdef CONFIG_CPU_BIG_ENDIAN
-		unsigned int pad:12, line_len:4, sz:4, config:4, ver:8;
-#else
-		unsigned int ver:8, config:4, sz:4, line_len:4, pad:12;
-#endif
-	} ibcr, dbcr;
 	struct bcr_generic sbcr;
 	struct bcr_slc_cfg {
@@ -80,6 +86,39 @@ void read_decode_cache_bcr(void)
 #endif
 	} slc_cfg;
+	struct bcr_clust_cfg {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int pad:7, c:1, num_entries:8, num_cores:8, ver:8;
+#else
+		unsigned int ver:8, num_cores:8, num_entries:8, c:1, pad:7;
+#endif
+	} cbcr;
+	READ_BCR(ARC_REG_SLC_BCR, sbcr);
+	if (sbcr.ver) {
+		READ_BCR(ARC_REG_SLC_CFG, slc_cfg);
+		p_slc->ver = sbcr.ver;
+		p_slc->sz_k = 128 << slc_cfg.sz;
+		l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64;
+	}
+	READ_BCR(ARC_REG_CLUSTER_BCR, cbcr);
+	if (cbcr.c && ioc_enable)
+		ioc_exists = 1;
+}
+void read_decode_cache_bcr(void)
+{
+	struct cpuinfo_arc_cache *p_ic, *p_dc;
+	unsigned int cpu = smp_processor_id();
+	struct bcr_cache {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int pad:12, line_len:4, sz:4, config:4, ver:8;
+#else
+		unsigned int ver:8, config:4, sz:4, line_len:4, pad:12;
+#endif
+	} ibcr, dbcr;
 	p_ic = &cpuinfo_arc700[cpu].icache;
 	READ_BCR(ARC_REG_IC_BCR, ibcr);
@@ -122,17 +161,8 @@ void read_decode_cache_bcr(void)
 	p_dc->ver = dbcr.ver;
 slc_chk:
-	if (!is_isa_arcv2())
+	if (is_isa_arcv2())
-		return;
+                read_decode_cache_bcr_arcv2(cpu);
-	p_slc = &cpuinfo_arc700[cpu].slc;
-	READ_BCR(ARC_REG_SLC_BCR, sbcr);
-	if (sbcr.ver) {
-		READ_BCR(ARC_REG_SLC_CFG, slc_cfg);
-		p_slc->ver = sbcr.ver;
-		p_slc->sz_k = 128 << slc_cfg.sz;
-		l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64;
-	}
 }
 /*
@@ -516,11 +546,6 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op)
 #endif
 }
-static inline int need_slc_flush(void)
-{
-	return is_isa_arcv2() && l2_line_sz;
-}
 /***********************************************************
 * Exported APIs
 */
@@ -569,31 +594,75 @@ void flush_dcache_page(struct page *page)
 }
 EXPORT_SYMBOL(flush_dcache_page);
-void dma_cache_wback_inv(unsigned long start, unsigned long sz)
+/*
+ * DMA ops for systems with L1 cache only
+ * Make memory coherent with L1 cache by flushing/invalidating L1 lines
+ */
+static void __dma_cache_wback_inv_l1(unsigned long start, unsigned long sz)
 {
 	__dc_line_op_k(start, sz, OP_FLUSH_N_INV);
+}
+static void __dma_cache_inv_l1(unsigned long start, unsigned long sz)
+{
+	__dc_line_op_k(start, sz, OP_INV);
+}
+static void __dma_cache_wback_l1(unsigned long start, unsigned long sz)
+{
+	__dc_line_op_k(start, sz, OP_FLUSH);
+}
-	if (need_slc_flush())
+/*
+ * DMA ops for systems with both L1 and L2 caches, but without IOC
+ * Both L1 and L2 lines need to be explicity flushed/invalidated
+ */
+static void __dma_cache_wback_inv_slc(unsigned long start, unsigned long sz)
+{
+	__dc_line_op_k(start, sz, OP_FLUSH_N_INV);
 	slc_op(start, sz, OP_FLUSH_N_INV);
 }
-EXPORT_SYMBOL(dma_cache_wback_inv);
-void dma_cache_inv(unsigned long start, unsigned long sz)
+static void __dma_cache_inv_slc(unsigned long start, unsigned long sz)
 {
 	__dc_line_op_k(start, sz, OP_INV);
-	if (need_slc_flush())
 	slc_op(start, sz, OP_INV);
 }
-EXPORT_SYMBOL(dma_cache_inv);
-void dma_cache_wback(unsigned long start, unsigned long sz)
+static void __dma_cache_wback_slc(unsigned long start, unsigned long sz)
 {
 	__dc_line_op_k(start, sz, OP_FLUSH);
-	if (need_slc_flush())
 	slc_op(start, sz, OP_FLUSH);
 }
+/*
+ * DMA ops for systems with IOC
+ * IOC hardware snoops all DMA traffic keeping the caches consistent with
+ * memory - eliding need for any explicit cache maintenance of DMA buffers
+ */
+static void __dma_cache_wback_inv_ioc(unsigned long start, unsigned long sz) {}
+static void __dma_cache_inv_ioc(unsigned long start, unsigned long sz) {}
+static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz) {}
+/*
+ * Exported DMA API
+ */
+void dma_cache_wback_inv(unsigned long start, unsigned long sz)
+{
+	__dma_cache_wback_inv(start, sz);
+}
+EXPORT_SYMBOL(dma_cache_wback_inv);
+void dma_cache_inv(unsigned long start, unsigned long sz)
+{
+	__dma_cache_inv(start, sz);
+}
+EXPORT_SYMBOL(dma_cache_inv);
+void dma_cache_wback(unsigned long start, unsigned long sz)
+{
+	__dma_cache_wback(start, sz);
+}
 EXPORT_SYMBOL(dma_cache_wback);
 /*
@@ -848,4 +917,41 @@ void arc_cache_init(void)
 				panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
 		}
 	}
+	if (is_isa_arcv2() && l2_line_sz && !slc_enable) {
+		/* IM set : flush before invalidate */
+		write_aux_reg(ARC_REG_SLC_CTRL,
+			read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_IM);
+		write_aux_reg(ARC_REG_SLC_INVALIDATE, 1);
+		/* Important to wait for flush to complete */
+		while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
+		write_aux_reg(ARC_REG_SLC_CTRL,
+			read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_DISABLE);
+	}
+	if (is_isa_arcv2() && ioc_exists) {
+		/* IO coherency base - 0x8z */
+		write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000);
+		/* IO coherency aperture size - 512Mb: 0x8z-0xAz */
+		write_aux_reg(ARC_REG_IO_COH_AP0_SIZE, 0x11);
+		/* Enable partial writes */
+		write_aux_reg(ARC_REG_IO_COH_PARTIAL, 1);
+		/* Enable IO coherency */
+		write_aux_reg(ARC_REG_IO_COH_ENABLE, 1);
+		__dma_cache_wback_inv = __dma_cache_wback_inv_ioc;
+		__dma_cache_inv = __dma_cache_inv_ioc;
+		__dma_cache_wback = __dma_cache_wback_ioc;
+	} else if (is_isa_arcv2() && l2_line_sz && slc_enable) {
+		__dma_cache_wback_inv = __dma_cache_wback_inv_slc;
+		__dma_cache_inv = __dma_cache_inv_slc;
+		__dma_cache_wback = __dma_cache_wback_slc;
+	} else {
+		__dma_cache_wback_inv = __dma_cache_wback_inv_l1;
+		__dma_cache_inv = __dma_cache_inv_l1;
+		__dma_cache_wback = __dma_cache_wback_l1;
+	}
 }
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -19,6 +19,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/dma-debug.h>
 #include <linux/export.h>
+#include <asm/cache.h>
 #include <asm/cacheflush.h>
 /*
@@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size,
 {
 	void *paddr, *kvaddr;
+	/*
+	 * IOC relies on all data (even coherent DMA data) being in cache
+	 * Thus allocate normal cached memory
+	 *
+	 * The gains with IOC are two pronged:
+	 *   -For streaming data, elides needs for cache maintenance, saving
+	 *    cycles in flush code, and bus bandwidth as all the lines of a
+	 *    buffer need to be flushed out to memory
+	 *   -For coherent data, Read/Write to buffers terminate early in cache
+	 *   (vs. always going to memory - thus are faster)
+	 */
+	if (is_isa_arcv2() && ioc_exists)
+		return dma_alloc_noncoherent(dev, size, dma_handle, gfp);
 	/* This is linear addr (0x8000_0000 based) */
 	paddr = alloc_pages_exact(size, gfp);
 	if (!paddr)
@@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent);
 void dma_free_coherent(struct device *dev, size_t size, void *kvaddr,
 		       dma_addr_t dma_handle)
 {
+	if (is_isa_arcv2() && ioc_exists)
+		return dma_free_noncoherent(dev, size, kvaddr, dma_handle);
 	iounmap((void __force __iomem *)kvaddr);
 	free_pages_exact((void *)dma_handle, size);

--- a/arch/arc/plat-axs10x/axs10x.c
+++ b/arch/arc/plat-axs10x/axs10x.c
@@ -46,7 +46,7 @@ static void __init axs10x_enable_gpio_intc_wire(void)
 	 * -------------------   -------------------
 	 * | snps,dw-apb-gpio |  | snps,dw-apb-gpio |
 	 * -------------------   -------------------
-	 *        |                         |
+	 *        | #12                     |
 	 *        |                 [ Debug UART on cpu card ]
 	 *        |
 	 * ------------------------