Commit df762ecc authored by Russell King

Merge branch 'devel-stable' into for-next

Conflicts:
	arch/arm/include/asm/atomic.h
	arch/arm/include/asm/hardirq.h
	arch/arm/kernel/smp.c
parents ec1e20a0 70d42126
@@ -52,6 +52,8 @@ config ARM
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_PERF_EVENTS
+select HAVE_PERF_REGS
+select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UID16
@@ -482,6 +484,7 @@ config ARCH_IXP4XX
bool "IXP4xx-based"
depends on MMU
select ARCH_HAS_DMA_SET_COHERENT_MASK
+select ARCH_SUPPORTS_BIG_ENDIAN
select ARCH_REQUIRE_GPIOLIB
select CLKSRC_MMIO
select CPU_XSCALE
@@ -1545,6 +1548,32 @@ config MCPM
for (multi-)cluster based systems, such as big.LITTLE based
systems.

+config BIG_LITTLE
+bool "big.LITTLE support (Experimental)"
+depends on CPU_V7 && SMP
+select MCPM
+help
+This option enables support selections for the big.LITTLE
+system architecture.
+
+config BL_SWITCHER
+bool "big.LITTLE switcher support"
+depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+select CPU_PM
+select ARM_CPU_SUSPEND
+help
+The big.LITTLE "switcher" provides the core functionality to
+transparently handle transition between a cluster of A15's
+and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+tristate "Simple big.LITTLE switcher user interface"
+depends on BL_SWITCHER && DEBUG_KERNEL
+help
+This is a simple and dummy char dev interface to control
+the big.LITTLE switcher core code. It is meant for
+debugging purposes only.
+
choice
prompt "Memory split"
default VMSPLIT_3G
...
@@ -16,6 +16,7 @@ LDFLAGS :=
LDFLAGS_vmlinux :=-p --no-undefined -X
ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
LDFLAGS_vmlinux += --be8
+LDFLAGS_MODULE += --be8
endif
OBJCOPYFLAGS :=-O binary -R .comment -S
...
@@ -135,6 +135,7 @@ start:
.word _edata @ zImage end address
THUMB( .thumb )
1:
+ARM_BE8( setend be ) @ go BE8 if compiled for BE8
mrs r9, cpsr
#ifdef CONFIG_ARM_VIRT_EXT
bl __hyp_stub_install @ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
mrc p15, 0, r0, c1, c0, 0 @ read control reg
orr r0, r0, #0x5000 @ I-cache enable, RR cache replacement
orr r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-orr r0, r0, #1 << 25 @ big-endian page tables
-#endif
+ARM_BE8( orr r0, r0, #1 << 25 ) @ big-endian page tables
bl __common_mmu_cache_on
mov r0, #0
mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
orr r0, r0, #1 << 22 @ U (v6 unaligned access model)
@ (needed for ARM1176)
#ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-orr r0, r0, #1 << 25 @ big-endian page tables
-#endif
+ARM_BE8( orr r0, r0, #1 << 25 ) @ big-endian page tables
mrcne p15, 0, r6, c2, c0, 2 @ read ttb control reg
orrne r0, r0, #1 @ MMU enabled
movne r1, #0xfffffffd @ domain 0 = client
...
@@ -17,3 +17,5 @@ obj-$(CONFIG_MCPM) += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
AFLAGS_mcpm_head.o := -march=armv7-a
AFLAGS_vlock.o := -march=armv7-a
obj-$(CONFIG_TI_PRIV_EDMA) += edma.o
+obj-$(CONFIG_BL_SWITCHER) += bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
/*
* arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
*
* Created by: Nicolas Pitre, March 2012
* Copyright: (C) 2012-2013 Linaro Limited
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/cpu_pm.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/clockchips.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/irqchip/arm-gic.h>
#include <linux/moduleparam.h>
#include <asm/smp_plat.h>
#include <asm/cputype.h>
#include <asm/suspend.h>
#include <asm/mcpm.h>
#include <asm/bL_switcher.h>
#define CREATE_TRACE_POINTS
#include <trace/events/power_cpu_migrate.h>
/*
* Use our own MPIDR accessors as the generic ones in asm/cputype.h have
* __attribute_const__ and we don't want the compiler to assume any
* constness here as the value _does_ change along some code paths.
*/
static int read_mpidr(void)
{
unsigned int id;
asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
return id & MPIDR_HWID_BITMASK;
}
/*
* Get a global nanosecond time stamp for tracing.
*/
static s64 get_ns(void)
{
struct timespec ts;
getnstimeofday(&ts);
return timespec_to_ns(&ts);
}
/*
* bL switcher core code.
*/
static void bL_do_switch(void *_arg)
{
unsigned ib_mpidr, ib_cpu, ib_cluster;
long volatile handshake, **handshake_ptr = _arg;
pr_debug("%s\n", __func__);
ib_mpidr = cpu_logical_map(smp_processor_id());
ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
/* Advertise our handshake location */
if (handshake_ptr) {
handshake = 0;
*handshake_ptr = &handshake;
} else
handshake = -1;
/*
* Our state has been saved at this point. Let's release our
* inbound CPU.
*/
mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
sev();
/*
* From this point, we must assume that our counterpart CPU might
* have taken over in its parallel world already, as if execution
* just returned from cpu_suspend(). It is therefore important to
* be very careful not to make any change the other guy is not
* expecting. This is why we need stack isolation.
*
* Fancy under cover tasks could be performed here. For now
* we have none.
*/
/*
* Let's wait until our inbound is alive.
*/
while (!handshake) {
wfe();
smp_mb();
}
/* Let's put ourself down. */
mcpm_cpu_power_down();
/* should never get here */
BUG();
}
/*
* Stack isolation. To ensure 'current' remains valid, we just use another
* piece of our thread's stack space which should be fairly lightly used.
* The selected area starts just above the thread_info structure located
* at the very bottom of the stack, aligned to a cache line, and indexed
* with the cluster number.
*/
#define STACK_SIZE 512
extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
static int bL_switchpoint(unsigned long _arg)
{
unsigned int mpidr = read_mpidr();
unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
void *stack = current_thread_info() + 1;
stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
stack += clusterid * STACK_SIZE + STACK_SIZE;
call_with_stack(bL_do_switch, (void *)_arg, stack);
BUG();
}
/*
* Generic switcher interface
*/
static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
static int bL_switcher_cpu_pairing[NR_CPUS];
/*
* bL_switch_to - Switch to a specific cluster for the current CPU
* @new_cluster_id: the ID of the cluster to switch to.
*
* This function must be called on the CPU to be switched.
* Returns 0 on success, else a negative status code.
*/
static int bL_switch_to(unsigned int new_cluster_id)
{
unsigned int mpidr, this_cpu, that_cpu;
unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
struct completion inbound_alive;
struct tick_device *tdev;
enum clock_event_mode tdev_mode;
long volatile *handshake_ptr;
int ipi_nr, ret;
this_cpu = smp_processor_id();
ob_mpidr = read_mpidr();
ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
if (new_cluster_id == ob_cluster)
return 0;
that_cpu = bL_switcher_cpu_pairing[this_cpu];
ib_mpidr = cpu_logical_map(that_cpu);
ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
this_cpu, ob_mpidr, ib_mpidr);
this_cpu = smp_processor_id();
/* Close the gate for our entry vectors */
mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
/* Install our "inbound alive" notifier. */
init_completion(&inbound_alive);
ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
/*
* Let's wake up the inbound CPU now in case it requires some delay
* to come online, but leave it gated in our entry vector code.
*/
ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
if (ret) {
pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
return ret;
}
/*
* Raise a SGI on the inbound CPU to make sure it doesn't stall
* in a possible WFI, such as in bL_power_down().
*/
gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
/*
* Wait for the inbound to come up. This allows for other
* tasks to be scheduled in the mean time.
*/
wait_for_completion(&inbound_alive);
mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
/*
* From this point we are entering the switch critical zone
* and can't take any interrupts anymore.
*/
local_irq_disable();
local_fiq_disable();
trace_cpu_migrate_begin(get_ns(), ob_mpidr);
/* redirect GIC's SGIs to our counterpart */
gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
tdev = tick_get_device(this_cpu);
if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
tdev = NULL;
if (tdev) {
tdev_mode = tdev->evtdev->mode;
clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
}
ret = cpu_pm_enter();
/* we can not tolerate errors at this point */
if (ret)
panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
/* Swap the physical CPUs in the logical map for this logical CPU. */
cpu_logical_map(this_cpu) = ib_mpidr;
cpu_logical_map(that_cpu) = ob_mpidr;
/* Let's do the actual CPU switch. */
ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
if (ret > 0)
panic("%s: cpu_suspend() returned %d\n", __func__, ret);
/* We are executing on the inbound CPU at this point */
mpidr = read_mpidr();
pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
BUG_ON(mpidr != ib_mpidr);
mcpm_cpu_powered_up();
ret = cpu_pm_exit();
if (tdev) {
clockevents_set_mode(tdev->evtdev, tdev_mode);
clockevents_program_event(tdev->evtdev,
tdev->evtdev->next_event, 1);
}
trace_cpu_migrate_finish(get_ns(), ib_mpidr);
local_fiq_enable();
local_irq_enable();
*handshake_ptr = 1;
dsb_sev();
if (ret)
pr_err("%s exiting with error %d\n", __func__, ret);
return ret;
}
struct bL_thread {
spinlock_t lock;
struct task_struct *task;
wait_queue_head_t wq;
int wanted_cluster;
struct completion started;
bL_switch_completion_handler completer;
void *completer_cookie;
};
static struct bL_thread bL_threads[NR_CPUS];
static int bL_switcher_thread(void *arg)
{
struct bL_thread *t = arg;
struct sched_param param = { .sched_priority = 1 };
int cluster;
bL_switch_completion_handler completer;
void *completer_cookie;
sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
complete(&t->started);
do {
if (signal_pending(current))
flush_signals(current);
wait_event_interruptible(t->wq,
t->wanted_cluster != -1 ||
kthread_should_stop());
spin_lock(&t->lock);
cluster = t->wanted_cluster;
completer = t->completer;
completer_cookie = t->completer_cookie;
t->wanted_cluster = -1;
t->completer = NULL;
spin_unlock(&t->lock);
if (cluster != -1) {
bL_switch_to(cluster);
if (completer)
completer(completer_cookie);
}
} while (!kthread_should_stop());
return 0;
}
static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
{
struct task_struct *task;
task = kthread_create_on_node(bL_switcher_thread, arg,
cpu_to_node(cpu), "kswitcher_%d", cpu);
if (!IS_ERR(task)) {
kthread_bind(task, cpu);
wake_up_process(task);
} else
pr_err("%s failed for CPU %d\n", __func__, cpu);
return task;
}
/*
* bL_switch_request_cb - Switch to a specific cluster for the given CPU,
* with completion notification via a callback
*
* @cpu: the CPU to switch
* @new_cluster_id: the ID of the cluster to switch to.
* @completer: switch completion callback. if non-NULL,
* @completer(@completer_cookie) will be called on completion of
* the switch, in non-atomic context.
* @completer_cookie: opaque context argument for @completer.
*
* This function causes a cluster switch on the given CPU by waking up
* the appropriate switcher thread. This function may or may not return
* before the switch has occurred.
*
* If a @completer callback function is supplied, it will be called when
* the switch is complete. This can be used to determine asynchronously
* when the switch is complete, regardless of when bL_switch_request()
* returns. When @completer is supplied, no new switch request is permitted
* for the affected CPU until after the switch is complete, and @completer
* has returned.
*/
int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
bL_switch_completion_handler completer,
void *completer_cookie)
{
struct bL_thread *t;
if (cpu >= ARRAY_SIZE(bL_threads)) {
pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
return -EINVAL;
}
t = &bL_threads[cpu];
if (IS_ERR(t->task))
return PTR_ERR(t->task);
if (!t->task)
return -ESRCH;
spin_lock(&t->lock);
if (t->completer) {
spin_unlock(&t->lock);
return -EBUSY;
}
t->completer = completer;
t->completer_cookie = completer_cookie;
t->wanted_cluster = new_cluster_id;
spin_unlock(&t->lock);
wake_up(&t->wq);
return 0;
}
EXPORT_SYMBOL_GPL(bL_switch_request_cb);
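/*
 * Illustrative sketch (not part of this commit): how a hypothetical
 * in-kernel caller might use bL_switch_request_cb() and block until the
 * requested switch has completed.  The completer runs in non-atomic
 * context once the switcher thread is done, so a completion is a natural
 * fit; this additionally needs <linux/completion.h>.
 */
static void example_switch_done(void *cookie)
{
	complete((struct completion *)cookie);
}

static int example_switch_and_wait(unsigned int cpu, unsigned int cluster)
{
	DECLARE_COMPLETION_ONSTACK(done);
	int ret;

	ret = bL_switch_request_cb(cpu, cluster, example_switch_done, &done);
	if (ret)
		return ret;	/* e.g. -EBUSY if a previous request is still pending */
	wait_for_completion(&done);
	return 0;
}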
/*
* Activation and configuration code.
*/
static DEFINE_MUTEX(bL_switcher_activation_lock);
static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
static unsigned int bL_switcher_active;
static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
static cpumask_t bL_switcher_removed_logical_cpus;
int bL_switcher_register_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&bL_activation_notifier, nb);
}
EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
int bL_switcher_unregister_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
}
EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
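/*
 * Illustrative sketch (not part of this commit): a hypothetical client
 * observing switcher activation changes through the notifier chain
 * registered above.  The BL_NOTIFY_* event codes are the ones passed to
 * bL_activation_notify() below and are defined in <asm/bL_switcher.h>.
 */
static int example_bL_notifier_call(struct notifier_block *nb,
				    unsigned long event, void *data)
{
	switch (event) {
	case BL_NOTIFY_PRE_ENABLE:
	case BL_NOTIFY_PRE_DISABLE:
		/* quiesce anything that depends on the current CPU layout */
		break;
	case BL_NOTIFY_POST_ENABLE:
	case BL_NOTIFY_POST_DISABLE:
		/* pick up the new logical CPU arrangement */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_bL_nb = {
	.notifier_call = example_bL_notifier_call,
};

/* registered with: bL_switcher_register_notifier(&example_bL_nb); */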
static int bL_activation_notify(unsigned long val)
{
int ret;
ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
if (ret & NOTIFY_STOP_MASK)
pr_err("%s: notifier chain failed with status 0x%x\n",
__func__, ret);
return notifier_to_errno(ret);
}
static void bL_switcher_restore_cpus(void)
{
int i;
for_each_cpu(i, &bL_switcher_removed_logical_cpus)
cpu_up(i);
}
static int bL_switcher_halve_cpus(void)
{
int i, j, cluster_0, gic_id, ret;
unsigned int cpu, cluster, mask;
cpumask_t available_cpus;
/* First pass to validate what we have */
mask = 0;
for_each_online_cpu(i) {
cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
if (cluster >= 2) {
pr_err("%s: only dual cluster systems are supported\n", __func__);
return -EINVAL;
}
if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
return -EINVAL;
mask |= (1 << cluster);
}
if (mask != 3) {
pr_err("%s: no CPU pairing possible\n", __func__);
return -EINVAL;
}
/*
* Now let's do the pairing. We match each CPU with another CPU
* from a different cluster. To get a uniform scheduling behavior
* without fiddling with CPU topology and compute capacity data,
* we'll use logical CPUs initially belonging to the same cluster.
*/
memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
cpumask_copy(&available_cpus, cpu_online_mask);
cluster_0 = -1;
for_each_cpu(i, &available_cpus) {
int match = -1;
cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
if (cluster_0 == -1)
cluster_0 = cluster;
if (cluster != cluster_0)
continue;
cpumask_clear_cpu(i, &available_cpus);
for_each_cpu(j, &available_cpus) {
cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
/*
* Let's remember the last match to create "odd"
* pairings on purpose in order for other code not
* to assume any relation between physical and
* logical CPU numbers.
*/
if (cluster != cluster_0)
match = j;
}
if (match != -1) {
bL_switcher_cpu_pairing[i] = match;
cpumask_clear_cpu(match, &available_cpus);
pr_info("CPU%d paired with CPU%d\n", i, match);
}
}
/*
* Now we disable the unwanted CPUs i.e. everything that has no
* pairing information (that includes the pairing counterparts).
*/
cpumask_clear(&bL_switcher_removed_logical_cpus);
for_each_online_cpu(i) {
cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
/* Let's take note of the GIC ID for this CPU */
gic_id = gic_get_cpu_id(i);
if (gic_id < 0) {
pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
bL_switcher_restore_cpus();
return -EINVAL;
}
bL_gic_id[cpu][cluster] = gic_id;
pr_info("GIC ID for CPU %u cluster %u is %u\n",
cpu, cluster, gic_id);
if (bL_switcher_cpu_pairing[i] != -1) {
bL_switcher_cpu_original_cluster[i] = cluster;
continue;
}
ret = cpu_down(i);
if (ret) {
bL_switcher_restore_cpus();
return ret;
}
cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
}
return 0;
}
/* Determine the logical CPU a given physical CPU is grouped on. */
int bL_switcher_get_logical_index(u32 mpidr)
{
int cpu;
if (!bL_switcher_active)
return -EUNATCH;
mpidr &= MPIDR_HWID_BITMASK;
for_each_online_cpu(cpu) {
int pairing = bL_switcher_cpu_pairing[cpu];
if (pairing == -1)
continue;
if ((mpidr == cpu_logical_map(cpu)) ||
(mpidr == cpu_logical_map(pairing)))
return cpu;
}
return -EINVAL;
}
static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
{
trace_cpu_migrate_current(get_ns(), read_mpidr());
}
int bL_switcher_trace_trigger(void)
{
int ret;
preempt_disable();
bL_switcher_trace_trigger_cpu(NULL);
ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
static int bL_switcher_enable(void)
{
int cpu, ret;
mutex_lock(&bL_switcher_activation_lock);
lock_device_hotplug();
if (bL_switcher_active) {
unlock_device_hotplug();
mutex_unlock(&bL_switcher_activation_lock);
return 0;
}
pr_info("big.LITTLE switcher initializing\n");
ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
if (ret)
goto error;
ret = bL_switcher_halve_cpus();
if (ret)
goto error;
bL_switcher_trace_trigger();
for_each_online_cpu(cpu) {
struct bL_thread *t = &bL_threads[cpu];
spin_lock_init(&t->lock);
init_waitqueue_head(&t->wq);
init_completion(&t->started);
t->wanted_cluster = -1;
t->task = bL_switcher_thread_create(cpu, t);
}
bL_switcher_active = 1;
bL_activation_notify(BL_NOTIFY_POST_ENABLE);
pr_info("big.LITTLE switcher initialized\n");
goto out;
error:
pr_warn("big.LITTLE switcher initialization failed\n");
bL_activation_notify(BL_NOTIFY_POST_DISABLE);
out:
unlock_device_hotplug();
mutex_unlock(&bL_switcher_activation_lock);
return ret;
}
#ifdef CONFIG_SYSFS
static void bL_switcher_disable(void)
{
unsigned int cpu, cluster;
struct bL_thread *t;
struct task_struct *task;
mutex_lock(&bL_switcher_activation_lock);
lock_device_hotplug();
if (!bL_switcher_active)
goto out;
if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
bL_activation_notify(BL_NOTIFY_POST_ENABLE);
goto out;
}
bL_switcher_active = 0;
/*
* To deactivate the switcher, we must shut down the switcher
* threads to prevent any other requests from being accepted.
* Then, if the final cluster for given logical CPU is not the
* same as the original one, we'll recreate a switcher thread
* just for the purpose of switching the CPU back without any
* possibility for interference from external requests.
*/
for_each_online_cpu(cpu) {
t = &bL_threads[cpu];
task = t->task;
t->task = NULL;
if (!task || IS_ERR(task))
continue;
kthread_stop(task);
/* no more switch may happen on this CPU at this point */
cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
if (cluster == bL_switcher_cpu_original_cluster[cpu])
continue;
init_completion(&t->started);
t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
task = bL_switcher_thread_create(cpu, t);
if (!IS_ERR(task)) {
wait_for_completion(&t->started);
kthread_stop(task);
cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
if (cluster == bL_switcher_cpu_original_cluster[cpu])
continue;
}
/* If execution gets here, we're in trouble. */
pr_crit("%s: unable to restore original cluster for CPU %d\n",
__func__, cpu);
pr_crit("%s: CPU %d can't be restored\n",
__func__, bL_switcher_cpu_pairing[cpu]);
cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
&bL_switcher_removed_logical_cpus);
}
bL_switcher_restore_cpus();
bL_switcher_trace_trigger();
bL_activation_notify(BL_NOTIFY_POST_DISABLE);
out:
unlock_device_hotplug();
mutex_unlock(&bL_switcher_activation_lock);
}
static ssize_t bL_switcher_active_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", bL_switcher_active);
}
static ssize_t bL_switcher_active_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
int ret;
switch (buf[0]) {
case '0':
bL_switcher_disable();
ret = 0;
break;
case '1':
ret = bL_switcher_enable();
break;
default:
ret = -EINVAL;
}
return (ret >= 0) ? count : ret;
}
static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
int ret = bL_switcher_trace_trigger();
return ret ? ret : count;
}
static struct kobj_attribute bL_switcher_active_attr =
__ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
static struct kobj_attribute bL_switcher_trace_trigger_attr =
__ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
static struct attribute *bL_switcher_attrs[] = {
&bL_switcher_active_attr.attr,
&bL_switcher_trace_trigger_attr.attr,
NULL,
};
static struct attribute_group bL_switcher_attr_group = {
.attrs = bL_switcher_attrs,
};
static struct kobject *bL_switcher_kobj;
static int __init bL_switcher_sysfs_init(void)
{
int ret;
bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
if (!bL_switcher_kobj)
return -ENOMEM;
ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
if (ret)
kobject_put(bL_switcher_kobj);
return ret;
}
#endif /* CONFIG_SYSFS */
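/*
 * Illustrative usage note (not part of this commit): the attribute group
 * above is created under kernel_kobj, so the switcher is controlled from
 * user space through /sys/kernel/bL_switcher/.  A minimal user-space
 * sketch, assuming that path:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int set_switcher_active(int on)
 *	{
 *		int fd = open("/sys/kernel/bL_switcher/active", O_WRONLY);
 *		ssize_t n;
 *
 *		if (fd < 0)
 *			return -1;
 *		n = write(fd, on ? "1" : "0", 1);
 *		close(fd);
 *		return n == 1 ? 0 : -1;
 *	}
 *
 * Writing anything to "trace_trigger" makes every online CPU emit a
 * cpu_migrate_current trace event, as implemented above.
 */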
bool bL_switcher_get_enabled(void)
{
mutex_lock(&bL_switcher_activation_lock);
return bL_switcher_active;
}
EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
void bL_switcher_put_enabled(void)
{
mutex_unlock(&bL_switcher_activation_lock);
}
EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
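/*
 * Illustrative sketch (not part of this commit): a hypothetical caller
 * holding the activation lock across a lookup so the switcher cannot be
 * enabled or disabled underneath it.  Note that bL_switcher_put_enabled()
 * must be called whether or not bL_switcher_get_enabled() returned true,
 * since the lock is taken unconditionally.
 */
static int example_logical_index(u32 mpidr)
{
	int cpu = -EUNATCH;

	if (bL_switcher_get_enabled())
		cpu = bL_switcher_get_logical_index(mpidr);
	bL_switcher_put_enabled();
	return cpu;
}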
/*
* Veto any CPU hotplug operation on those CPUs we've removed
* while the switcher is active.
* We're just not ready to deal with that given the trickery involved.
*/
static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
if (bL_switcher_active) {
int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
switch (action & 0xf) {
case CPU_UP_PREPARE:
case CPU_DOWN_PREPARE:
if (pairing == -1)
return NOTIFY_BAD;
}
}
return NOTIFY_DONE;
}
static bool no_bL_switcher;
core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
static int __init bL_switcher_init(void)
{
int ret;
if (MAX_NR_CLUSTERS != 2) {
pr_err("%s: only dual cluster systems are supported\n", __func__);
return -EINVAL;
}
cpu_notifier(bL_switcher_hotplug_callback, 0);
if (!no_bL_switcher) {
ret = bL_switcher_enable();
if (ret)
return ret;
}
#ifdef CONFIG_SYSFS
ret = bL_switcher_sysfs_init();
if (ret)
pr_err("%s: unable to create sysfs entry\n", __func__);
#endif
return 0;
}
late_initcall(bL_switcher_init);
/*
* arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
*
* Created by: Nicolas Pitre, November 2012
* Copyright: (C) 2012-2013 Linaro Limited
*
* Dummy interface to user space for debugging purpose only.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <asm/uaccess.h>
#include <asm/bL_switcher.h>
static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
size_t len, loff_t *pos)
{
unsigned char val[3];
unsigned int cpu, cluster;
int ret;
pr_debug("%s\n", __func__);
if (len < 3)
return -EINVAL;
if (copy_from_user(val, buf, 3))
return -EFAULT;
/* format: <cpu#>,<cluster#> */
if (val[0] < '0' || val[0] > '9' ||
val[1] != ',' ||
val[2] < '0' || val[2] > '1')
return -EINVAL;
cpu = val[0] - '0';
cluster = val[2] - '0';
ret = bL_switch_request(cpu, cluster);
return ret ? : len;
}
static const struct file_operations bL_switcher_fops = {
.write = bL_switcher_write,
.owner = THIS_MODULE,
};
static struct miscdevice bL_switcher_device = {
MISC_DYNAMIC_MINOR,
"b.L_switcher",
&bL_switcher_fops
};
static int __init bL_switcher_dummy_if_init(void)
{
return misc_register(&bL_switcher_device);
}
static void __exit bL_switcher_dummy_if_exit(void)
{
misc_deregister(&bL_switcher_device);
}
module_init(bL_switcher_dummy_if_init);
module_exit(bL_switcher_dummy_if_exit);
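/*
 * Illustrative usage note (not part of this commit): the driver above
 * registers a misc device named "b.L_switcher", which with a typical udev
 * setup shows up as /dev/b.L_switcher (the device node path is an
 * assumption).  A switch is requested by writing "<cpu#>,<cluster#>",
 * e.g. from user space:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int request_switch(int cpu, int cluster)
 *	{
 *		char buf[3] = { '0' + cpu, ',', '0' + cluster };
 *		int fd = open("/dev/b.L_switcher", O_WRONLY);
 *		ssize_t n;
 *
 *		if (fd < 0)
 *			return -1;
 *		n = write(fd, buf, sizeof(buf));
 *		close(fd);
 *		return n < 0 ? -1 : 0;
 *	}
 *
 * Only a single-digit CPU number and cluster '0' or '1' are accepted,
 * matching the parsing in bL_switcher_write() above.
 */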
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
}
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+unsigned long poke_phys_addr, unsigned long poke_val)
+{
+unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+poke[0] = poke_phys_addr;
+poke[1] = poke_val;
+__cpuc_flush_dcache_area((void *)poke, 8);
+outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
static const struct mcpm_platform_ops *platform_ops;
int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
...
@@ -15,6 +15,7 @@
#include <linux/linkage.h>
#include <asm/mcpm.h>
+#include <asm/assembler.h>
#include "vlock.h"
@@ -47,6 +48,7 @@
ENTRY(mcpm_entry_point)
+ARM_BE8(setend be)
THUMB( adr r12, BSYM(1f) )
THUMB( bx r12 )
THUMB( .thumb )
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
* position independent way.
*/
adr r5, 3f
-ldmia r5, {r6, r7, r8, r11}
+ldmia r5, {r0, r6, r7, r8, r11}
+add r0, r5, r0 @ r0 = mcpm_entry_early_pokes
add r6, r5, r6 @ r6 = mcpm_entry_vectors
ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys
add r8, r5, r8 @ r8 = mcpm_sync
add r11, r5, r11 @ r11 = first_man_locks
+
+@ Perform an early poke, if any
+add r0, r0, r4, lsl #3
+ldmia r0, {r0, r1}
+teq r0, #0
+strne r1, [r0]
+
mov r0, #MCPM_SYNC_CLUSTER_SIZE
mla r8, r0, r10, r8 @ r8 = sync cluster base
@@ -195,7 +204,8 @@ mcpm_entry_gated:
.align 2
-3: .word mcpm_entry_vectors - .
+3: .word mcpm_entry_early_pokes - .
+.word mcpm_entry_vectors - 3b
.word mcpm_power_up_setup_phys - 3b
.word mcpm_sync - 3b
.word first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
ENTRY(mcpm_entry_vectors)
.space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
+.type mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+.space 8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
.type mcpm_power_up_setup_phys, #object
ENTRY(mcpm_power_up_setup_phys)
.space 4 @ set by mcpm_sync_init()
@@ -3,7 +3,17 @@
#
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o

aes-arm-y := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-bs-y := aesbs-core.o aesbs-glue.o
+sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL $@
+cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
@@ -6,22 +6,12 @@
#include <linux/crypto.h>
#include <crypto/aes.h>
-#define AES_MAXNR 14
-typedef struct {
-unsigned int rd_key[4 *(AES_MAXNR + 1)];
-int rounds;
-} AES_KEY;
-struct AES_CTX {
-AES_KEY enc_key;
-AES_KEY dec_key;
-};
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+#include "aes_glue.h"
+
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
...
#define AES_MAXNR 14
struct AES_KEY {
unsigned int rd_key[4 * (AES_MAXNR + 1)];
int rounds;
};
struct AES_CTX {
struct AES_KEY enc_key;
struct AES_KEY dec_key;
};
asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
const int bits, struct AES_KEY *key);
asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
const int bits, struct AES_KEY *key);
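/*
 * Illustrative sketch (not part of this commit): how a caller might drive
 * the assembler routines declared above to encrypt one 16-byte block,
 * mirroring what aes_glue.c does inside its crypto_alg callbacks.  The
 * assumption here is the usual OpenSSL-style convention that the set-key
 * routine returns 0 on success.
 */
static int example_encrypt_block(const u8 *key, int key_bits,
				 const u8 in[16], u8 out[16])
{
	struct AES_KEY enc_key;

	if (private_AES_set_encrypt_key(key, key_bits, &enc_key) != 0)
		return -EINVAL;	/* unsupported key size */
	AES_encrypt(in, out, &enc_key);
	return 0;
}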
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
@ granted.
@ ====================================================================
@ Bit-sliced AES for ARM NEON
@
@ February 2012.
@
@ This implementation is direct adaptation of bsaes-x86_64 module for
@ ARM NEON. Except that this module is endian-neutral [in sense that
@ it can be compiled for either endianness] by courtesy of vld1.8's
@ neutrality. Initial version doesn't implement interface to OpenSSL,
@ only low-level primitives and unsupported entry points, just enough
@ to collect performance results, which for Cortex-A8 core are:
@
@ encrypt 19.5 cycles per byte processed with 128-bit key
@ decrypt 22.1 cycles per byte processed with 128-bit key
@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
@
@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
@ which is [much] worse than anticipated (for further details see
@ http://www.openssl.org/~appro/Snapdragon-S4.html).
@
@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
@ manages in 20.0 cycles].
@
@ When comparing to x86_64 results keep in mind that NEON unit is
@ [mostly] single-issue and thus can't [fully] benefit from
@ instruction-level parallelism. And when comparing to aes-armv4
@ results keep in mind key schedule conversion overhead (see
@ bsaes-x86_64.pl for further details)...
@
@ <appro@openssl.org>
@ April-August 2013
@
@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
@
@ <ard.biesheuvel@linaro.org>
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define VFP_ABI_FRAME 0x40
#else
# define VFP_ABI_PUSH
# define VFP_ABI_POP
# define VFP_ABI_FRAME 0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
#endif
#ifdef __thumb__
# define adrl adr
#endif
#if __ARM_ARCH__>=7
.text
.syntax unified @ ARMv7-capable assembler is expected to handle this
#ifdef __thumb2__
.thumb
#else
.code 32
#endif
.fpu neon
.type _bsaes_decrypt8,%function
.align 4
_bsaes_decrypt8:
adr r6,_bsaes_decrypt8
vldmia r4!, {q9} @ round 0 key
add r6,r6,#.LM0ISR-_bsaes_decrypt8
vldmia r6!, {q8} @ .LM0ISR
veor q10, q0, q9 @ xor with round0 key
veor q11, q1, q9
vtbl.8 d0, {q10}, d16
vtbl.8 d1, {q10}, d17
veor q12, q2, q9
vtbl.8 d2, {q11}, d16
vtbl.8 d3, {q11}, d17
veor q13, q3, q9
vtbl.8 d4, {q12}, d16
vtbl.8 d5, {q12}, d17
veor q14, q4, q9
vtbl.8 d6, {q13}, d16
vtbl.8 d7, {q13}, d17
veor q15, q5, q9
vtbl.8 d8, {q14}, d16
vtbl.8 d9, {q14}, d17
veor q10, q6, q9
vtbl.8 d10, {q15}, d16
vtbl.8 d11, {q15}, d17
veor q11, q7, q9
vtbl.8 d12, {q10}, d16
vtbl.8 d13, {q10}, d17
vtbl.8 d14, {q11}, d16
vtbl.8 d15, {q11}, d17
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshr.u64 q10, q6, #1
vshr.u64 q11, q4, #1
veor q10, q10, q7
veor q11, q11, q5
vand q10, q10, q8
vand q11, q11, q8
veor q7, q7, q10
vshl.u64 q10, q10, #1
veor q5, q5, q11
vshl.u64 q11, q11, #1
veor q6, q6, q10
veor q4, q4, q11
vshr.u64 q10, q2, #1
vshr.u64 q11, q0, #1
veor q10, q10, q3
veor q11, q11, q1
vand q10, q10, q8
vand q11, q11, q8
veor q3, q3, q10
vshl.u64 q10, q10, #1
veor q1, q1, q11
vshl.u64 q11, q11, #1
veor q2, q2, q10
veor q0, q0, q11
vmov.i8 q8,#0x0f @ compose .LBS2
vshr.u64 q10, q5, #2
vshr.u64 q11, q4, #2
veor q10, q10, q7
veor q11, q11, q6
vand q10, q10, q9
vand q11, q11, q9
veor q7, q7, q10
vshl.u64 q10, q10, #2
veor q6, q6, q11
vshl.u64 q11, q11, #2
veor q5, q5, q10
veor q4, q4, q11
vshr.u64 q10, q1, #2
vshr.u64 q11, q0, #2
veor q10, q10, q3
veor q11, q11, q2
vand q10, q10, q9
vand q11, q11, q9
veor q3, q3, q10
vshl.u64 q10, q10, #2
veor q2, q2, q11
vshl.u64 q11, q11, #2
veor q1, q1, q10
veor q0, q0, q11
vshr.u64 q10, q3, #4
vshr.u64 q11, q2, #4
veor q10, q10, q7
veor q11, q11, q6
vand q10, q10, q8
vand q11, q11, q8
veor q7, q7, q10
vshl.u64 q10, q10, #4
veor q6, q6, q11
vshl.u64 q11, q11, #4
veor q3, q3, q10
veor q2, q2, q11
vshr.u64 q10, q1, #4
vshr.u64 q11, q0, #4
veor q10, q10, q5
veor q11, q11, q4
vand q10, q10, q8
vand q11, q11, q8
veor q5, q5, q10
vshl.u64 q10, q10, #4
veor q4, q4, q11
vshl.u64 q11, q11, #4
veor q1, q1, q10
veor q0, q0, q11
sub r5,r5,#1
b .Ldec_sbox
.align 4
.Ldec_loop:
vldmia r4!, {q8-q11}
veor q8, q8, q0
veor q9, q9, q1
vtbl.8 d0, {q8}, d24
vtbl.8 d1, {q8}, d25
vldmia r4!, {q8}
veor q10, q10, q2
vtbl.8 d2, {q9}, d24
vtbl.8 d3, {q9}, d25
vldmia r4!, {q9}
veor q11, q11, q3
vtbl.8 d4, {q10}, d24
vtbl.8 d5, {q10}, d25
vldmia r4!, {q10}
vtbl.8 d6, {q11}, d24
vtbl.8 d7, {q11}, d25
vldmia r4!, {q11}
veor q8, q8, q4
veor q9, q9, q5
vtbl.8 d8, {q8}, d24
vtbl.8 d9, {q8}, d25
veor q10, q10, q6
vtbl.8 d10, {q9}, d24
vtbl.8 d11, {q9}, d25
veor q11, q11, q7
vtbl.8 d12, {q10}, d24
vtbl.8 d13, {q10}, d25
vtbl.8 d14, {q11}, d24
vtbl.8 d15, {q11}, d25
.Ldec_sbox:
veor q1, q1, q4
veor q3, q3, q4
veor q4, q4, q7
veor q1, q1, q6
veor q2, q2, q7
veor q6, q6, q4
veor q0, q0, q1
veor q2, q2, q5
veor q7, q7, q6
veor q3, q3, q0
veor q5, q5, q0
veor q1, q1, q3
veor q11, q3, q0
veor q10, q7, q4
veor q9, q1, q6
veor q13, q4, q0
vmov q8, q10
veor q12, q5, q2
vorr q10, q10, q9
veor q15, q11, q8
vand q14, q11, q12
vorr q11, q11, q12
veor q12, q12, q9
vand q8, q8, q9
veor q9, q6, q2
vand q15, q15, q12
vand q13, q13, q9
veor q9, q3, q7
veor q12, q1, q5
veor q11, q11, q13
veor q10, q10, q13
vand q13, q9, q12
vorr q9, q9, q12
veor q11, q11, q15
veor q8, q8, q13
veor q10, q10, q14
veor q9, q9, q15
veor q8, q8, q14
vand q12, q4, q6
veor q9, q9, q14
vand q13, q0, q2
vand q14, q7, q1
vorr q15, q3, q5
veor q11, q11, q12
veor q9, q9, q14
veor q8, q8, q15
veor q10, q10, q13
@ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
@ new smaller inversion
vand q14, q11, q9
vmov q12, q8
veor q13, q10, q14
veor q15, q8, q14
veor q14, q8, q14 @ q14=q15
vbsl q13, q9, q8
vbsl q15, q11, q10
veor q11, q11, q10
vbsl q12, q13, q14
vbsl q8, q14, q13
vand q14, q12, q15
veor q9, q9, q8
veor q14, q14, q11
veor q12, q5, q2
veor q8, q1, q6
veor q10, q15, q14
vand q10, q10, q5
veor q5, q5, q1
vand q11, q1, q15
vand q5, q5, q14
veor q1, q11, q10
veor q5, q5, q11
veor q15, q15, q13
veor q14, q14, q9
veor q11, q15, q14
veor q10, q13, q9
vand q11, q11, q12
vand q10, q10, q2
veor q12, q12, q8
veor q2, q2, q6
vand q8, q8, q15
vand q6, q6, q13
vand q12, q12, q14
vand q2, q2, q9
veor q8, q8, q12
veor q2, q2, q6
veor q12, q12, q11
veor q6, q6, q10
veor q5, q5, q12
veor q2, q2, q12
veor q1, q1, q8
veor q6, q6, q8
veor q12, q3, q0
veor q8, q7, q4
veor q11, q15, q14
veor q10, q13, q9
vand q11, q11, q12
vand q10, q10, q0
veor q12, q12, q8
veor q0, q0, q4
vand q8, q8, q15
vand q4, q4, q13
vand q12, q12, q14
vand q0, q0, q9
veor q8, q8, q12
veor q0, q0, q4
veor q12, q12, q11
veor q4, q4, q10
veor q15, q15, q13
veor q14, q14, q9
veor q10, q15, q14
vand q10, q10, q3
veor q3, q3, q7
vand q11, q7, q15
vand q3, q3, q14
veor q7, q11, q10
veor q3, q3, q11
veor q3, q3, q12
veor q0, q0, q12
veor q7, q7, q8
veor q4, q4, q8
veor q1, q1, q7
veor q6, q6, q5
veor q4, q4, q1
veor q2, q2, q7
veor q5, q5, q7
veor q4, q4, q2
veor q7, q7, q0
veor q4, q4, q5
veor q3, q3, q6
veor q6, q6, q1
veor q3, q3, q4
veor q4, q4, q0
veor q7, q7, q3
subs r5,r5,#1
bcc .Ldec_done
@ multiplication by 0x05-0x00-0x04-0x00
vext.8 q8, q0, q0, #8
vext.8 q14, q3, q3, #8
vext.8 q15, q5, q5, #8
veor q8, q8, q0
vext.8 q9, q1, q1, #8
veor q14, q14, q3
vext.8 q10, q6, q6, #8
veor q15, q15, q5
vext.8 q11, q4, q4, #8
veor q9, q9, q1
vext.8 q12, q2, q2, #8
veor q10, q10, q6
vext.8 q13, q7, q7, #8
veor q11, q11, q4
veor q12, q12, q2
veor q13, q13, q7
veor q0, q0, q14
veor q1, q1, q14
veor q6, q6, q8
veor q2, q2, q10
veor q4, q4, q9
veor q1, q1, q15
veor q6, q6, q15
veor q2, q2, q14
veor q7, q7, q11
veor q4, q4, q14
veor q3, q3, q12
veor q2, q2, q15
veor q7, q7, q15
veor q5, q5, q13
vext.8 q8, q0, q0, #12 @ x0 <<< 32
vext.8 q9, q1, q1, #12
veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
vext.8 q10, q6, q6, #12
veor q1, q1, q9
vext.8 q11, q4, q4, #12
veor q6, q6, q10
vext.8 q12, q2, q2, #12
veor q4, q4, q11
vext.8 q13, q7, q7, #12
veor q2, q2, q12
vext.8 q14, q3, q3, #12
veor q7, q7, q13
vext.8 q15, q5, q5, #12
veor q3, q3, q14
veor q9, q9, q0
veor q5, q5, q15
vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
veor q10, q10, q1
veor q8, q8, q5
veor q9, q9, q5
vext.8 q1, q1, q1, #8
veor q13, q13, q2
veor q0, q0, q8
veor q14, q14, q7
veor q1, q1, q9
vext.8 q8, q2, q2, #8
veor q12, q12, q4
vext.8 q9, q7, q7, #8
veor q15, q15, q3
vext.8 q2, q4, q4, #8
veor q11, q11, q6
vext.8 q7, q5, q5, #8
veor q12, q12, q5
vext.8 q4, q3, q3, #8
veor q11, q11, q5
vext.8 q3, q6, q6, #8
veor q5, q9, q13
veor q11, q11, q2
veor q7, q7, q15
veor q6, q4, q14
veor q4, q8, q12
veor q2, q3, q10
vmov q3, q11
@ vmov q5, q9
vldmia r6, {q12} @ .LISR
ite eq @ Thumb2 thing, sanity check in ARM
addeq r6,r6,#0x10
bne .Ldec_loop
vldmia r6, {q12} @ .LISRM0
b .Ldec_loop
.align 4
.Ldec_done:
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshr.u64 q10, q3, #1
vshr.u64 q11, q2, #1
veor q10, q10, q5
veor q11, q11, q7
vand q10, q10, q8
vand q11, q11, q8
veor q5, q5, q10
vshl.u64 q10, q10, #1
veor q7, q7, q11
vshl.u64 q11, q11, #1
veor q3, q3, q10
veor q2, q2, q11
vshr.u64 q10, q6, #1
vshr.u64 q11, q0, #1
veor q10, q10, q4
veor q11, q11, q1
vand q10, q10, q8
vand q11, q11, q8
veor q4, q4, q10
vshl.u64 q10, q10, #1
veor q1, q1, q11
vshl.u64 q11, q11, #1
veor q6, q6, q10
veor q0, q0, q11
vmov.i8 q8,#0x0f @ compose .LBS2
vshr.u64 q10, q7, #2
vshr.u64 q11, q2, #2
veor q10, q10, q5
veor q11, q11, q3
vand q10, q10, q9
vand q11, q11, q9
veor q5, q5, q10
vshl.u64 q10, q10, #2
veor q3, q3, q11
vshl.u64 q11, q11, #2
veor q7, q7, q10
veor q2, q2, q11
vshr.u64 q10, q1, #2
vshr.u64 q11, q0, #2
veor q10, q10, q4
veor q11, q11, q6
vand q10, q10, q9
vand q11, q11, q9
veor q4, q4, q10
vshl.u64 q10, q10, #2
veor q6, q6, q11
vshl.u64 q11, q11, #2
veor q1, q1, q10
veor q0, q0, q11
vshr.u64 q10, q4, #4
vshr.u64 q11, q6, #4
veor q10, q10, q5
veor q11, q11, q3
vand q10, q10, q8
vand q11, q11, q8
veor q5, q5, q10
vshl.u64 q10, q10, #4
veor q3, q3, q11
vshl.u64 q11, q11, #4
veor q4, q4, q10
veor q6, q6, q11
vshr.u64 q10, q1, #4
vshr.u64 q11, q0, #4
veor q10, q10, q7
veor q11, q11, q2
vand q10, q10, q8
vand q11, q11, q8
veor q7, q7, q10
vshl.u64 q10, q10, #4
veor q2, q2, q11
vshl.u64 q11, q11, #4
veor q1, q1, q10
veor q0, q0, q11
vldmia r4, {q8} @ last round key
veor q6, q6, q8
veor q4, q4, q8
veor q2, q2, q8
veor q7, q7, q8
veor q3, q3, q8
veor q5, q5, q8
veor q0, q0, q8
veor q1, q1, q8
bx lr
.size _bsaes_decrypt8,.-_bsaes_decrypt8
.type _bsaes_const,%object
.align 6
_bsaes_const:
.LM0ISR: @ InvShiftRows constants
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LM0SR: @ ShiftRows constants
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02
.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 6
.size _bsaes_const,.-_bsaes_const
.type _bsaes_encrypt8,%function
.align 4
_bsaes_encrypt8:
adr r6,_bsaes_encrypt8
vldmia r4!, {q9} @ round 0 key
sub r6,r6,#_bsaes_encrypt8-.LM0SR
vldmia r6!, {q8} @ .LM0SR
_bsaes_encrypt8_alt:
veor q10, q0, q9 @ xor with round0 key
veor q11, q1, q9
vtbl.8 d0, {q10}, d16
vtbl.8 d1, {q10}, d17
veor q12, q2, q9
vtbl.8 d2, {q11}, d16
vtbl.8 d3, {q11}, d17
veor q13, q3, q9
vtbl.8 d4, {q12}, d16
vtbl.8 d5, {q12}, d17
veor q14, q4, q9
vtbl.8 d6, {q13}, d16
vtbl.8 d7, {q13}, d17
veor q15, q5, q9
vtbl.8 d8, {q14}, d16
vtbl.8 d9, {q14}, d17
veor q10, q6, q9
vtbl.8 d10, {q15}, d16
vtbl.8 d11, {q15}, d17
veor q11, q7, q9
vtbl.8 d12, {q10}, d16
vtbl.8 d13, {q10}, d17
vtbl.8 d14, {q11}, d16
vtbl.8 d15, {q11}, d17
_bsaes_encrypt8_bitslice:
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshr.u64 q10, q6, #1
vshr.u64 q11, q4, #1
veor q10, q10, q7
veor q11, q11, q5
vand q10, q10, q8
vand q11, q11, q8
veor q7, q7, q10
vshl.u64 q10, q10, #1
veor q5, q5, q11
vshl.u64 q11, q11, #1
veor q6, q6, q10
veor q4, q4, q11
vshr.u64 q10, q2, #1
vshr.u64 q11, q0, #1
veor q10, q10, q3
veor q11, q11, q1
vand q10, q10, q8
vand q11, q11, q8
veor q3, q3, q10
vshl.u64 q10, q10, #1
veor q1, q1, q11
vshl.u64 q11, q11, #1
veor q2, q2, q10
veor q0, q0, q11
vmov.i8 q8,#0x0f @ compose .LBS2
vshr.u64 q10, q5, #2
vshr.u64 q11, q4, #2
veor q10, q10, q7
veor q11, q11, q6
vand q10, q10, q9
vand q11, q11, q9
veor q7, q7, q10
vshl.u64 q10, q10, #2
veor q6, q6, q11
vshl.u64 q11, q11, #2
veor q5, q5, q10
veor q4, q4, q11
vshr.u64 q10, q1, #2
vshr.u64 q11, q0, #2
veor q10, q10, q3
veor q11, q11, q2
vand q10, q10, q9
vand q11, q11, q9
veor q3, q3, q10
vshl.u64 q10, q10, #2
veor q2, q2, q11
vshl.u64 q11, q11, #2
veor q1, q1, q10
veor q0, q0, q11
vshr.u64 q10, q3, #4
vshr.u64 q11, q2, #4
veor q10, q10, q7
veor q11, q11, q6
vand q10, q10, q8
vand q11, q11, q8
veor q7, q7, q10
vshl.u64 q10, q10, #4
veor q6, q6, q11
vshl.u64 q11, q11, #4
veor q3, q3, q10
veor q2, q2, q11
vshr.u64 q10, q1, #4
vshr.u64 q11, q0, #4
veor q10, q10, q5
veor q11, q11, q4
vand q10, q10, q8
vand q11, q11, q8
veor q5, q5, q10
vshl.u64 q10, q10, #4
veor q4, q4, q11
vshl.u64 q11, q11, #4
veor q1, q1, q10
veor q0, q0, q11
sub r5,r5,#1
b .Lenc_sbox
.align 4
.Lenc_loop:
vldmia r4!, {q8-q11}
veor q8, q8, q0
veor q9, q9, q1
vtbl.8 d0, {q8}, d24
vtbl.8 d1, {q8}, d25
vldmia r4!, {q8}
veor q10, q10, q2
vtbl.8 d2, {q9}, d24
vtbl.8 d3, {q9}, d25
vldmia r4!, {q9}
veor q11, q11, q3
vtbl.8 d4, {q10}, d24
vtbl.8 d5, {q10}, d25
vldmia r4!, {q10}
vtbl.8 d6, {q11}, d24
vtbl.8 d7, {q11}, d25
vldmia r4!, {q11}
veor q8, q8, q4
veor q9, q9, q5
vtbl.8 d8, {q8}, d24
vtbl.8 d9, {q8}, d25
veor q10, q10, q6
vtbl.8 d10, {q9}, d24
vtbl.8 d11, {q9}, d25
veor q11, q11, q7
vtbl.8 d12, {q10}, d24
vtbl.8 d13, {q10}, d25
vtbl.8 d14, {q11}, d24
vtbl.8 d15, {q11}, d25
.Lenc_sbox:
veor q2, q2, q1
veor q5, q5, q6
veor q3, q3, q0
veor q6, q6, q2
veor q5, q5, q0
veor q6, q6, q3
veor q3, q3, q7
veor q7, q7, q5
veor q3, q3, q4
veor q4, q4, q5
veor q2, q2, q7
veor q3, q3, q1
veor q1, q1, q5
veor q11, q7, q4
veor q10, q1, q2
veor q9, q5, q3
veor q13, q2, q4
vmov q8, q10
veor q12, q6, q0
vorr q10, q10, q9
veor q15, q11, q8
vand q14, q11, q12
vorr q11, q11, q12
veor q12, q12, q9
vand q8, q8, q9
veor q9, q3, q0
vand q15, q15, q12
vand q13, q13, q9
veor q9, q7, q1
veor q12, q5, q6
veor q11, q11, q13
veor q10, q10, q13
vand q13, q9, q12
vorr q9, q9, q12
veor q11, q11, q15
veor q8, q8, q13
veor q10, q10, q14
veor q9, q9, q15
veor q8, q8, q14
vand q12, q2, q3
veor q9, q9, q14
vand q13, q4, q0
vand q14, q1, q5
vorr q15, q7, q6
veor q11, q11, q12
veor q9, q9, q14
veor q8, q8, q15
veor q10, q10, q13
@ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
@ new smaller inversion
vand q14, q11, q9
vmov q12, q8
veor q13, q10, q14
veor q15, q8, q14
veor q14, q8, q14 @ q14=q15
vbsl q13, q9, q8
vbsl q15, q11, q10
veor q11, q11, q10
vbsl q12, q13, q14
vbsl q8, q14, q13
vand q14, q12, q15
veor q9, q9, q8
veor q14, q14, q11
veor q12, q6, q0
veor q8, q5, q3
veor q10, q15, q14
vand q10, q10, q6
veor q6, q6, q5
vand q11, q5, q15
vand q6, q6, q14
veor q5, q11, q10
veor q6, q6, q11
veor q15, q15, q13
veor q14, q14, q9
veor q11, q15, q14
veor q10, q13, q9
vand q11, q11, q12
vand q10, q10, q0
veor q12, q12, q8
veor q0, q0, q3
vand q8, q8, q15
vand q3, q3, q13
vand q12, q12, q14
vand q0, q0, q9
veor q8, q8, q12
veor q0, q0, q3
veor q12, q12, q11
veor q3, q3, q10
veor q6, q6, q12
veor q0, q0, q12
veor q5, q5, q8
veor q3, q3, q8
veor q12, q7, q4
veor q8, q1, q2
veor q11, q15, q14
veor q10, q13, q9
vand q11, q11, q12
vand q10, q10, q4
veor q12, q12, q8
veor q4, q4, q2
vand q8, q8, q15
vand q2, q2, q13
vand q12, q12, q14
vand q4, q4, q9
veor q8, q8, q12
veor q4, q4, q2
veor q12, q12, q11
veor q2, q2, q10
veor q15, q15, q13
veor q14, q14, q9
veor q10, q15, q14
vand q10, q10, q7
veor q7, q7, q1
vand q11, q1, q15
vand q7, q7, q14
veor q1, q11, q10
veor q7, q7, q11
veor q7, q7, q12
veor q4, q4, q12
veor q1, q1, q8
veor q2, q2, q8
veor q7, q7, q0
veor q1, q1, q6
veor q6, q6, q0
veor q4, q4, q7
veor q0, q0, q1
veor q1, q1, q5
veor q5, q5, q2
veor q2, q2, q3
veor q3, q3, q5
veor q4, q4, q5
veor q6, q6, q3
subs r5,r5,#1
bcc .Lenc_done
vext.8 q8, q0, q0, #12 @ x0 <<< 32
vext.8 q9, q1, q1, #12
veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
vext.8 q10, q4, q4, #12
veor q1, q1, q9
vext.8 q11, q6, q6, #12
veor q4, q4, q10
vext.8 q12, q3, q3, #12
veor q6, q6, q11
vext.8 q13, q7, q7, #12
veor q3, q3, q12
vext.8 q14, q2, q2, #12
veor q7, q7, q13
vext.8 q15, q5, q5, #12
veor q2, q2, q14
veor q9, q9, q0
veor q5, q5, q15
vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
veor q10, q10, q1
veor q8, q8, q5
veor q9, q9, q5
vext.8 q1, q1, q1, #8
veor q13, q13, q3
veor q0, q0, q8
veor q14, q14, q7
veor q1, q1, q9
vext.8 q8, q3, q3, #8
veor q12, q12, q6
vext.8 q9, q7, q7, #8
veor q15, q15, q2
vext.8 q3, q6, q6, #8
veor q11, q11, q4
vext.8 q7, q5, q5, #8
veor q12, q12, q5
vext.8 q6, q2, q2, #8
veor q11, q11, q5
vext.8 q2, q4, q4, #8
veor q5, q9, q13
veor q4, q8, q12
veor q3, q3, q11
veor q7, q7, q15
veor q6, q6, q14
@ vmov q4, q8
veor q2, q2, q10
@ vmov q5, q9
vldmia r6, {q12} @ .LSR
ite eq @ Thumb2 thing, sanity check in ARM
addeq r6,r6,#0x10
bne .Lenc_loop
vldmia r6, {q12} @ .LSRM0
b .Lenc_loop
.align 4
.Lenc_done:
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshr.u64 q10, q2, #1
vshr.u64 q11, q3, #1
veor q10, q10, q5
veor q11, q11, q7
vand q10, q10, q8
vand q11, q11, q8
veor q5, q5, q10
vshl.u64 q10, q10, #1
veor q7, q7, q11
vshl.u64 q11, q11, #1
veor q2, q2, q10
veor q3, q3, q11
vshr.u64 q10, q4, #1
vshr.u64 q11, q0, #1
veor q10, q10, q6
veor q11, q11, q1
vand q10, q10, q8
vand q11, q11, q8
veor q6, q6, q10
vshl.u64 q10, q10, #1
veor q1, q1, q11
vshl.u64 q11, q11, #1
veor q4, q4, q10
veor q0, q0, q11
vmov.i8 q8,#0x0f @ compose .LBS2
vshr.u64 q10, q7, #2
vshr.u64 q11, q3, #2
veor q10, q10, q5
veor q11, q11, q2
vand q10, q10, q9
vand q11, q11, q9
veor q5, q5, q10
vshl.u64 q10, q10, #2
veor q2, q2, q11
vshl.u64 q11, q11, #2
veor q7, q7, q10
veor q3, q3, q11
vshr.u64 q10, q1, #2
vshr.u64 q11, q0, #2
veor q10, q10, q6
veor q11, q11, q4
vand q10, q10, q9
vand q11, q11, q9
veor q6, q6, q10
vshl.u64 q10, q10, #2
veor q4, q4, q11
vshl.u64 q11, q11, #2
veor q1, q1, q10
veor q0, q0, q11
vshr.u64 q10, q6, #4
vshr.u64 q11, q4, #4
veor q10, q10, q5
veor q11, q11, q2
vand q10, q10, q8
vand q11, q11, q8
veor q5, q5, q10
vshl.u64 q10, q10, #4
veor q2, q2, q11
vshl.u64 q11, q11, #4
veor q6, q6, q10
veor q4, q4, q11
vshr.u64 q10, q1, #4
vshr.u64 q11, q0, #4
veor q10, q10, q7
veor q11, q11, q3
vand q10, q10, q8
vand q11, q11, q8
veor q7, q7, q10
vshl.u64 q10, q10, #4
veor q3, q3, q11
vshl.u64 q11, q11, #4
veor q1, q1, q10
veor q0, q0, q11
vldmia r4, {q8} @ last round key
veor q4, q4, q8
veor q6, q6, q8
veor q3, q3, q8
veor q7, q7, q8
veor q2, q2, q8
veor q5, q5, q8
veor q0, q0, q8
veor q1, q1, q8
bx lr
.size _bsaes_encrypt8,.-_bsaes_encrypt8
.type _bsaes_key_convert,%function
.align 4
_bsaes_key_convert:
adr r6,_bsaes_key_convert
vld1.8 {q7}, [r4]! @ load round 0 key
sub r6,r6,#_bsaes_key_convert-.LM0
vld1.8 {q15}, [r4]! @ load round 1 key
vmov.i8 q8, #0x01 @ bit masks
vmov.i8 q9, #0x02
vmov.i8 q10, #0x04
vmov.i8 q11, #0x08
vmov.i8 q12, #0x10
vmov.i8 q13, #0x20
vldmia r6, {q14} @ .LM0
#ifdef __ARMEL__
vrev32.8 q7, q7
vrev32.8 q15, q15
#endif
sub r5,r5,#1
vstmia r12!, {q7} @ save round 0 key
b .Lkey_loop
.align 4
.Lkey_loop:
vtbl.8 d14,{q15},d28
vtbl.8 d15,{q15},d29
vmov.i8 q6, #0x40
vmov.i8 q15, #0x80
vtst.8 q0, q7, q8
vtst.8 q1, q7, q9
vtst.8 q2, q7, q10
vtst.8 q3, q7, q11
vtst.8 q4, q7, q12
vtst.8 q5, q7, q13
vtst.8 q6, q7, q6
vtst.8 q7, q7, q15
vld1.8 {q15}, [r4]! @ load next round key
vmvn q0, q0 @ "pnot"
vmvn q1, q1
vmvn q5, q5
vmvn q6, q6
#ifdef __ARMEL__
vrev32.8 q15, q15
#endif
subs r5,r5,#1
vstmia r12!,{q0-q7} @ write bit-sliced round key
bne .Lkey_loop
vmov.i8 q7,#0x63 @ compose .L63
@ don't save last round key
bx lr
.size _bsaes_key_convert,.-_bsaes_key_convert
.extern AES_cbc_encrypt
.extern AES_decrypt
.global bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
.align 5
bsaes_cbc_encrypt:
#ifndef __KERNEL__
cmp r2, #128
#ifndef __thumb__
blo AES_cbc_encrypt
#else
bhs 1f
b AES_cbc_encrypt
1:
#endif
#endif
@ it is up to the caller to make sure we are called with enc == 0
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
ldr r8, [ip] @ IV is 1st arg on the stack
mov r2, r2, lsr#4 @ len in 16 byte blocks
sub sp, #0x10 @ scratch space to carry over the IV
mov r9, sp @ save sp
ldr r10, [r3, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
add r12, #96 @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
mov sp, r12 @ sp is sp
bl _bsaes_key_convert
vldmia sp, {q6}
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
vstmia sp, {q7}
#else
ldr r12, [r3, #244]
eors r12, #1
beq 0f
@ populate the key schedule
str r12, [r3, #244]
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
add r12, r3, #248 @ pass key schedule
bl _bsaes_key_convert
add r4, r3, #248
vldmia r4, {q6}
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
vstmia r4, {q7}
.align 2
0:
#endif
vld1.8 {q15}, [r8] @ load IV
b .Lcbc_dec_loop
.align 4
.Lcbc_dec_loop:
subs r2, r2, #0x8
bmi .Lcbc_dec_loop_finish
vld1.8 {q0-q1}, [r0]! @ load input
vld1.8 {q2-q3}, [r0]!
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
#else
add r4, r3, #248
#endif
vld1.8 {q4-q5}, [r0]!
mov r5, r10
vld1.8 {q6-q7}, [r0]
sub r0, r0, #0x60
vstmia r9, {q15} @ put aside IV
bl _bsaes_decrypt8
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
veor q1, q1, q8
veor q6, q6, q9
vld1.8 {q12-q13}, [r0]!
veor q4, q4, q10
veor q2, q2, q11
vld1.8 {q14-q15}, [r0]!
veor q7, q7, q12
vst1.8 {q0-q1}, [r1]! @ write output
veor q3, q3, q13
vst1.8 {q6}, [r1]!
veor q5, q5, q14
vst1.8 {q4}, [r1]!
vst1.8 {q2}, [r1]!
vst1.8 {q7}, [r1]!
vst1.8 {q3}, [r1]!
vst1.8 {q5}, [r1]!
b .Lcbc_dec_loop
.Lcbc_dec_loop_finish:
adds r2, r2, #8
beq .Lcbc_dec_done
vld1.8 {q0}, [r0]! @ load input
cmp r2, #2
blo .Lcbc_dec_one
vld1.8 {q1}, [r0]!
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
#else
add r4, r3, #248
#endif
mov r5, r10
vstmia r9, {q15} @ put aside IV
beq .Lcbc_dec_two
vld1.8 {q2}, [r0]!
cmp r2, #4
blo .Lcbc_dec_three
vld1.8 {q3}, [r0]!
beq .Lcbc_dec_four
vld1.8 {q4}, [r0]!
cmp r2, #6
blo .Lcbc_dec_five
vld1.8 {q5}, [r0]!
beq .Lcbc_dec_six
vld1.8 {q6}, [r0]!
sub r0, r0, #0x70
bl _bsaes_decrypt8
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
veor q1, q1, q8
veor q6, q6, q9
vld1.8 {q12-q13}, [r0]!
veor q4, q4, q10
veor q2, q2, q11
vld1.8 {q15}, [r0]!
veor q7, q7, q12
vst1.8 {q0-q1}, [r1]! @ write output
veor q3, q3, q13
vst1.8 {q6}, [r1]!
vst1.8 {q4}, [r1]!
vst1.8 {q2}, [r1]!
vst1.8 {q7}, [r1]!
vst1.8 {q3}, [r1]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
sub r0, r0, #0x60
bl _bsaes_decrypt8
vldmia r9,{q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
veor q1, q1, q8
veor q6, q6, q9
vld1.8 {q12}, [r0]!
veor q4, q4, q10
veor q2, q2, q11
vld1.8 {q15}, [r0]!
veor q7, q7, q12
vst1.8 {q0-q1}, [r1]! @ write output
vst1.8 {q6}, [r1]!
vst1.8 {q4}, [r1]!
vst1.8 {q2}, [r1]!
vst1.8 {q7}, [r1]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
sub r0, r0, #0x50
bl _bsaes_decrypt8
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
veor q1, q1, q8
veor q6, q6, q9
vld1.8 {q15}, [r0]!
veor q4, q4, q10
vst1.8 {q0-q1}, [r1]! @ write output
veor q2, q2, q11
vst1.8 {q6}, [r1]!
vst1.8 {q4}, [r1]!
vst1.8 {q2}, [r1]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
sub r0, r0, #0x40
bl _bsaes_decrypt8
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10}, [r0]!
veor q1, q1, q8
veor q6, q6, q9
vld1.8 {q15}, [r0]!
veor q4, q4, q10
vst1.8 {q0-q1}, [r1]! @ write output
vst1.8 {q6}, [r1]!
vst1.8 {q4}, [r1]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
sub r0, r0, #0x30
bl _bsaes_decrypt8
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q15}, [r0]!
veor q1, q1, q8
veor q6, q6, q9
vst1.8 {q0-q1}, [r1]! @ write output
vst1.8 {q6}, [r1]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
sub r0, r0, #0x20
bl _bsaes_decrypt8
vldmia r9, {q14} @ reload IV
vld1.8 {q8}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q15}, [r0]! @ reload input
veor q1, q1, q8
vst1.8 {q0-q1}, [r1]! @ write output
b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
sub r0, r0, #0x10
mov r10, r1 @ save original out pointer
mov r1, r9 @ use the iv scratch space as out buffer
mov r2, r3
vmov q4,q15 @ just in case ensure that IV
vmov q5,q0 @ and input are preserved
bl AES_decrypt
vld1.8 {q0}, [r9,:64] @ load result
veor q0, q0, q4 @ ^= IV
vmov q15, q5 @ q5 holds input
vst1.8 {q0}, [r10] @ write output
.Lcbc_dec_done:
#ifndef BSAES_ASM_EXTENDED_KEY
vmov.i32 q0, #0
vmov.i32 q1, #0
.Lcbc_dec_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r9
bne .Lcbc_dec_bzero
#endif
mov sp, r9
add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
vst1.8 {q15}, [r8] @ return IV
VFP_ABI_POP
ldmia sp!, {r4-r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.extern AES_encrypt
.global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
cmp r2, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
ldr r8, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov r9, sp @ save sp
ldr r10, [r3, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
add r12, #96 @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
mov sp, r12 @ sp now points at the key schedule
bl _bsaes_key_convert
veor q7,q7,q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
vld1.8 {q0}, [r8] @ load counter
add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
vldmia sp, {q4} @ load round0 key
#else
ldr r12, [r3, #244]
eors r12, #1
beq 0f
@ populate the key schedule
str r12, [r3, #244]
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
add r12, r3, #248 @ pass key schedule
bl _bsaes_key_convert
veor q7,q7,q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
.align 2
0: add r12, r3, #248
vld1.8 {q0}, [r8] @ load counter
adrl r8, .LREVM0SR @ borrow r8
vldmia r12, {q4} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
#endif
vmov.i32 q8,#1 @ compose 1<<96
veor q9,q9,q9
vrev32.8 q0,q0
vext.8 q8,q9,q8,#4
vrev32.8 q4,q4
vadd.u32 q9,q8,q8 @ compose 2<<96
vstmia sp, {q4} @ save adjusted round0 key
b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
vadd.u32 q10, q8, q9 @ compose 3<<96
vadd.u32 q1, q0, q8 @ +1
vadd.u32 q2, q0, q9 @ +2
vadd.u32 q3, q0, q10 @ +3
vadd.u32 q4, q1, q10
vadd.u32 q5, q2, q10
vadd.u32 q6, q3, q10
vadd.u32 q7, q4, q10
vadd.u32 q10, q5, q10 @ next counter
@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
@ to flip byte order in 32-bit counter
vldmia sp, {q9} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x10 @ pass next round key
#else
add r4, r3, #264
#endif
vldmia r8, {q8} @ .LREVM0SR
mov r5, r10 @ pass rounds
vstmia r9, {q10} @ save next counter
sub r6, r8, #.LREVM0SR-.LSR @ pass constants
bl _bsaes_encrypt8_alt
subs r2, r2, #8
blo .Lctr_enc_loop_done
vld1.8 {q8-q9}, [r0]! @ load input
vld1.8 {q10-q11}, [r0]!
veor q0, q8
veor q1, q9
vld1.8 {q12-q13}, [r0]!
veor q4, q10
veor q6, q11
vld1.8 {q14-q15}, [r0]!
veor q3, q12
vst1.8 {q0-q1}, [r1]! @ write output
veor q7, q13
veor q2, q14
vst1.8 {q4}, [r1]!
veor q5, q15
vst1.8 {q6}, [r1]!
vmov.i32 q8, #1 @ compose 1<<96
vst1.8 {q3}, [r1]!
veor q9, q9, q9
vst1.8 {q7}, [r1]!
vext.8 q8, q9, q8, #4
vst1.8 {q2}, [r1]!
vadd.u32 q9,q8,q8 @ compose 2<<96
vst1.8 {q5}, [r1]!
vldmia r9, {q0} @ load counter
bne .Lctr_enc_loop
b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
add r2, r2, #8
vld1.8 {q8}, [r0]! @ load input
veor q0, q8
vst1.8 {q0}, [r1]! @ write output
cmp r2, #2
blo .Lctr_enc_done
vld1.8 {q9}, [r0]!
veor q1, q9
vst1.8 {q1}, [r1]!
beq .Lctr_enc_done
vld1.8 {q10}, [r0]!
veor q4, q10
vst1.8 {q4}, [r1]!
cmp r2, #4
blo .Lctr_enc_done
vld1.8 {q11}, [r0]!
veor q6, q11
vst1.8 {q6}, [r1]!
beq .Lctr_enc_done
vld1.8 {q12}, [r0]!
veor q3, q12
vst1.8 {q3}, [r1]!
cmp r2, #6
blo .Lctr_enc_done
vld1.8 {q13}, [r0]!
veor q7, q13
vst1.8 {q7}, [r1]!
beq .Lctr_enc_done
vld1.8 {q14}, [r0]
veor q2, q14
vst1.8 {q2}, [r1]!
.Lctr_enc_done:
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r9
bne .Lctr_enc_bzero
#else
vstmia sp, {q0-q1}
#endif
mov sp, r9
add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.align 4
.Lctr_enc_short:
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4-r8, lr}
mov r4, r0 @ copy arguments
mov r5, r1
mov r6, r2
mov r7, r3
ldr r8, [ip, #12] @ load counter LSW
vld1.8 {q1}, [ip] @ load whole counter value
#ifdef __ARMEL__
rev r8, r8
#endif
sub sp, sp, #0x10
vst1.8 {q1}, [sp,:64] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
mov r2, r7 @ key
bl AES_encrypt
vld1.8 {q0}, [r4]! @ load input
vld1.8 {q1}, [sp,:64] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
str r0, [sp, #0x1c] @ next counter value
#else
str r8, [sp, #0x1c] @ next counter value
#endif
veor q0,q0,q1
vst1.8 {q0}, [r5]! @ store output
subs r6, r6, #1
bne .Lctr_enc_short_loop
vmov.i32 q0, #0
vmov.i32 q1, #0
vstmia sp!, {q0-q1}
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future r3
mov r7, r0
mov r8, r1
mov r9, r2
mov r10, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0,sp @ pointer to initial tweak
#endif
ldr r1, [r10, #240] @ get # of rounds
mov r3, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
@ add r12, #96 @ size of bit-sliced key schedule
sub r12, #48 @ place for tweak[9]
@ populate the key schedule
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor q7, q7, q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
#else
ldr r12, [r10, #244]
eors r12, #1
beq 0f
str r12, [r10, #244]
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, r10, #248 @ pass key schedule
bl _bsaes_key_convert
veor q7, q7, q15 @ fix up last round key
vstmia r12, {q7}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {q8}, [r0] @ initial tweak
adr r2, .Lxts_magic
subs r9, #0x80
blo .Lxts_enc_short
b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
vldmia r2, {q5} @ load XTS magic
vshr.s64 q6, q8, #63
mov r0, sp
vand q6, q6, q5
vadd.u64 q9, q8, q8
vst1.64 {q8}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q9, #63
veor q9, q9, q6
vand q7, q7, q5
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q10, #63
veor q10, q10, q7
vand q6, q6, q5
vld1.8 {q0}, [r7]!
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q11, #63
veor q11, q11, q6
vand q7, q7, q5
vld1.8 {q1}, [r7]!
veor q0, q0, q8
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q12, #63
veor q12, q12, q7
vand q6, q6, q5
vld1.8 {q2}, [r7]!
veor q1, q1, q9
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q13, #63
veor q13, q13, q6
vand q7, q7, q5
vld1.8 {q3}, [r7]!
veor q2, q2, q10
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q14, #63
veor q14, q14, q7
vand q6, q6, q5
vld1.8 {q4}, [r7]!
veor q3, q3, q11
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q15, #63
veor q15, q15, q6
vand q7, q7, q5
vld1.8 {q5}, [r7]!
veor q4, q4, q12
vadd.u64 q8, q15, q15
vst1.64 {q15}, [r0,:128]!
vswp d15,d14
veor q8, q8, q7
vst1.64 {q8}, [r0,:128] @ next round tweak
vld1.8 {q6-q7}, [r7]!
veor q5, q5, q13
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q6, q6, q14
mov r5, r1 @ pass rounds
veor q7, q7, q15
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12-q13}, [r0,:128]!
veor q1, q1, q9
veor q8, q4, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q6, q11
vld1.64 {q14-q15}, [r0,:128]!
veor q10, q3, q12
vst1.8 {q8-q9}, [r8]!
veor q11, q7, q13
veor q12, q2, q14
vst1.8 {q10-q11}, [r8]!
veor q13, q5, q15
vst1.8 {q12-q13}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
subs r9, #0x80
bpl .Lxts_enc_loop
.Lxts_enc_short:
adds r9, #0x70
bmi .Lxts_enc_done
vldmia r2, {q5} @ load XTS magic
vshr.s64 q7, q8, #63
mov r0, sp
vand q7, q7, q5
vadd.u64 q9, q8, q8
vst1.64 {q8}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q9, #63
veor q9, q9, q7
vand q6, q6, q5
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q10, #63
veor q10, q10, q6
vand q7, q7, q5
vld1.8 {q0}, [r7]!
subs r9, #0x10
bmi .Lxts_enc_1
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q11, #63
veor q11, q11, q7
vand q6, q6, q5
vld1.8 {q1}, [r7]!
subs r9, #0x10
bmi .Lxts_enc_2
veor q0, q0, q8
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q12, #63
veor q12, q12, q6
vand q7, q7, q5
vld1.8 {q2}, [r7]!
subs r9, #0x10
bmi .Lxts_enc_3
veor q1, q1, q9
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q13, #63
veor q13, q13, q7
vand q6, q6, q5
vld1.8 {q3}, [r7]!
subs r9, #0x10
bmi .Lxts_enc_4
veor q2, q2, q10
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q14, #63
veor q14, q14, q6
vand q7, q7, q5
vld1.8 {q4}, [r7]!
subs r9, #0x10
bmi .Lxts_enc_5
veor q3, q3, q11
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q15, #63
veor q15, q15, q7
vand q6, q6, q5
vld1.8 {q5}, [r7]!
subs r9, #0x10
bmi .Lxts_enc_6
veor q4, q4, q12
sub r9, #0x10
vst1.64 {q15}, [r0,:128] @ next round tweak
vld1.8 {q6}, [r7]!
veor q5, q5, q13
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q6, q6, q14
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12-q13}, [r0,:128]!
veor q1, q1, q9
veor q8, q4, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q6, q11
vld1.64 {q14}, [r0,:128]!
veor q10, q3, q12
vst1.8 {q8-q9}, [r8]!
veor q11, q7, q13
veor q12, q2, q14
vst1.8 {q10-q11}, [r8]!
vst1.8 {q12}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_6:
vst1.64 {q14}, [r0,:128] @ next round tweak
veor q4, q4, q12
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q5, q5, q13
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12-q13}, [r0,:128]!
veor q1, q1, q9
veor q8, q4, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q6, q11
veor q10, q3, q12
vst1.8 {q8-q9}, [r8]!
veor q11, q7, q13
vst1.8 {q10-q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align 5
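@ .Lxts_magic holds the two constants used when doubling the tweak in
@ GF(2^128): the low quad (1) carries the bit shifted out of the low 64-bit
@ half into the high half, and 0x87 folds the bit shifted out of the high
@ half back into the low byte, per the XTS polynomial x^128+x^7+x^2+x+1.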
.Lxts_magic:
.quad 1, 0x87
.align 5
.Lxts_enc_5:
vst1.64 {q13}, [r0,:128] @ next round tweak
veor q3, q3, q11
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q4, q4, q12
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12}, [r0,:128]!
veor q1, q1, q9
veor q8, q4, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q6, q11
veor q10, q3, q12
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_4:
vst1.64 {q12}, [r0,:128] @ next round tweak
veor q2, q2, q10
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q3, q3, q11
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
veor q1, q1, q9
veor q8, q4, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q6, q11
vst1.8 {q8-q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_3:
vst1.64 {q11}, [r0,:128] @ next round tweak
veor q1, q1, q9
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q2, q2, q10
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10}, [r0,:128]!
veor q0, q0, q8
veor q1, q1, q9
veor q8, q4, q10
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_2:
vst1.64 {q10}, [r0,:128] @ next round tweak
veor q0, q0, q8
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q1, q1, q9
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {q8-q9}, [r0,:128]!
veor q0, q0, q8
veor q1, q1, q9
vst1.8 {q0-q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_1:
mov r0, sp
veor q0, q8
mov r1, sp
vst1.8 {q0}, [sp,:128]
mov r2, r10
mov r4, r3 @ preserve fp
bl AES_encrypt
vld1.8 {q0}, [sp,:128]
veor q0, q0, q8
vst1.8 {q0}, [r8]!
mov r3, r4
vmov q8, q9 @ next round tweak
.Lxts_enc_done:
#ifndef XTS_CHAIN_TWEAK
adds r9, #0x10
beq .Lxts_enc_ret
sub r6, r8, #0x10
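@ ciphertext stealing: swap the trailing plaintext bytes into the last full
@ ciphertext block (its displaced bytes become the short final block), then
@ re-encrypt that block below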
.Lxts_enc_steal:
ldrb r0, [r7], #1
ldrb r1, [r8, #-0x10]
strb r0, [r8, #-0x10]
strb r1, [r8], #1
subs r9, #1
bhi .Lxts_enc_steal
vld1.8 {q0}, [r6]
mov r0, sp
veor q0, q0, q8
mov r1, sp
vst1.8 {q0}, [sp,:128]
mov r2, r10
mov r4, r3 @ preserve fp
bl AES_encrypt
vld1.8 {q0}, [sp,:128]
veor q0, q0, q8
vst1.8 {q0}, [r6]
mov r3, r4
#endif
.Lxts_enc_ret:
bic r0, r3, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_enc_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_enc_bzero
mov sp, r3
#ifdef XTS_CHAIN_TWEAK
vst1.8 {q8}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future r3
mov r7, r0
mov r8, r1
mov r9, r2
mov r10, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0, sp @ pointer to initial tweak
#endif
ldr r1, [r10, #240] @ get # of rounds
mov r3, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
@ add r12, #96 @ size of bit-sliced key schedule
sub r12, #48 @ place for tweak[9]
@ populate the key schedule
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
add r4, sp, #0x90
vldmia r4, {q6}
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
vstmia r4, {q7}
#else
ldr r12, [r10, #244]
eors r12, #1
beq 0f
str r12, [r10, #244]
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, r10, #248 @ pass key schedule
bl _bsaes_key_convert
add r4, r10, #248
vldmia r4, {q6}
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
vstmia r4, {q7}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {q8}, [r0] @ initial tweak
adr r2, .Lxts_magic
tst r9, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne r9, #0x10 @ subtract another 16 bytes
subs r9, #0x80
blo .Lxts_dec_short
b .Lxts_dec_loop
.align 4
.Lxts_dec_loop:
vldmia r2, {q5} @ load XTS magic
vshr.s64 q6, q8, #63
mov r0, sp
vand q6, q6, q5
vadd.u64 q9, q8, q8
vst1.64 {q8}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q9, #63
veor q9, q9, q6
vand q7, q7, q5
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q10, #63
veor q10, q10, q7
vand q6, q6, q5
vld1.8 {q0}, [r7]!
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q11, #63
veor q11, q11, q6
vand q7, q7, q5
vld1.8 {q1}, [r7]!
veor q0, q0, q8
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q12, #63
veor q12, q12, q7
vand q6, q6, q5
vld1.8 {q2}, [r7]!
veor q1, q1, q9
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q13, #63
veor q13, q13, q6
vand q7, q7, q5
vld1.8 {q3}, [r7]!
veor q2, q2, q10
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q14, #63
veor q14, q14, q7
vand q6, q6, q5
vld1.8 {q4}, [r7]!
veor q3, q3, q11
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q15, #63
veor q15, q15, q6
vand q7, q7, q5
vld1.8 {q5}, [r7]!
veor q4, q4, q12
vadd.u64 q8, q15, q15
vst1.64 {q15}, [r0,:128]!
vswp d15,d14
veor q8, q8, q7
vst1.64 {q8}, [r0,:128] @ next round tweak
vld1.8 {q6-q7}, [r7]!
veor q5, q5, q13
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q6, q6, q14
mov r5, r1 @ pass rounds
veor q7, q7, q15
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12-q13}, [r0,:128]!
veor q1, q1, q9
veor q8, q6, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q4, q11
vld1.64 {q14-q15}, [r0,:128]!
veor q10, q2, q12
vst1.8 {q8-q9}, [r8]!
veor q11, q7, q13
veor q12, q3, q14
vst1.8 {q10-q11}, [r8]!
veor q13, q5, q15
vst1.8 {q12-q13}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
subs r9, #0x80
bpl .Lxts_dec_loop
.Lxts_dec_short:
adds r9, #0x70
bmi .Lxts_dec_done
vldmia r2, {q5} @ load XTS magic
vshr.s64 q7, q8, #63
mov r0, sp
vand q7, q7, q5
vadd.u64 q9, q8, q8
vst1.64 {q8}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q9, #63
veor q9, q9, q7
vand q6, q6, q5
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q10, #63
veor q10, q10, q6
vand q7, q7, q5
vld1.8 {q0}, [r7]!
subs r9, #0x10
bmi .Lxts_dec_1
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q11, #63
veor q11, q11, q7
vand q6, q6, q5
vld1.8 {q1}, [r7]!
subs r9, #0x10
bmi .Lxts_dec_2
veor q0, q0, q8
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q12, #63
veor q12, q12, q6
vand q7, q7, q5
vld1.8 {q2}, [r7]!
subs r9, #0x10
bmi .Lxts_dec_3
veor q1, q1, q9
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q13, #63
veor q13, q13, q7
vand q6, q6, q5
vld1.8 {q3}, [r7]!
subs r9, #0x10
bmi .Lxts_dec_4
veor q2, q2, q10
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vswp d13,d12
vshr.s64 q7, q14, #63
veor q14, q14, q6
vand q7, q7, q5
vld1.8 {q4}, [r7]!
subs r9, #0x10
bmi .Lxts_dec_5
veor q3, q3, q11
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vswp d15,d14
vshr.s64 q6, q15, #63
veor q15, q15, q7
vand q6, q6, q5
vld1.8 {q5}, [r7]!
subs r9, #0x10
bmi .Lxts_dec_6
veor q4, q4, q12
sub r9, #0x10
vst1.64 {q15}, [r0,:128] @ next round tweak
vld1.8 {q6}, [r7]!
veor q5, q5, q13
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q6, q6, q14
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12-q13}, [r0,:128]!
veor q1, q1, q9
veor q8, q6, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q4, q11
vld1.64 {q14}, [r0,:128]!
veor q10, q2, q12
vst1.8 {q8-q9}, [r8]!
veor q11, q7, q13
veor q12, q3, q14
vst1.8 {q10-q11}, [r8]!
vst1.8 {q12}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_6:
vst1.64 {q14}, [r0,:128] @ next round tweak
veor q4, q4, q12
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q5, q5, q13
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12-q13}, [r0,:128]!
veor q1, q1, q9
veor q8, q6, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q4, q11
veor q10, q2, q12
vst1.8 {q8-q9}, [r8]!
veor q11, q7, q13
vst1.8 {q10-q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_5:
vst1.64 {q13}, [r0,:128] @ next round tweak
veor q3, q3, q11
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q4, q4, q12
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
vld1.64 {q12}, [r0,:128]!
veor q1, q1, q9
veor q8, q6, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q4, q11
veor q10, q2, q12
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_4:
vst1.64 {q12}, [r0,:128] @ next round tweak
veor q2, q2, q10
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q3, q3, q11
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
veor q0, q0, q8
veor q1, q1, q9
veor q8, q6, q10
vst1.8 {q0-q1}, [r8]!
veor q9, q4, q11
vst1.8 {q8-q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_3:
vst1.64 {q11}, [r0,:128] @ next round tweak
veor q1, q1, q9
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q2, q2, q10
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10}, [r0,:128]!
veor q0, q0, q8
veor q1, q1, q9
veor q8, q6, q10
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_2:
vst1.64 {q10}, [r0,:128] @ next round tweak
veor q0, q0, q8
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, r10, #248 @ pass key schedule
#endif
veor q1, q1, q9
mov r5, r1 @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {q8-q9}, [r0,:128]!
veor q0, q0, q8
veor q1, q1, q9
vst1.8 {q0-q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_1:
mov r0, sp
veor q0, q8
mov r1, sp
vst1.8 {q0}, [sp,:128]
mov r2, r10
mov r4, r3 @ preserve fp
mov r5, r2 @ preserve magic
bl AES_decrypt
vld1.8 {q0}, [sp,:128]
veor q0, q0, q8
vst1.8 {q0}, [r8]!
mov r3, r4
mov r2, r5
vmov q8, q9 @ next round tweak
.Lxts_dec_done:
#ifndef XTS_CHAIN_TWEAK
adds r9, #0x10
beq .Lxts_dec_ret
@ calculate one round of extra tweak for the stolen ciphertext
vldmia r2, {q5}
vshr.s64 q6, q8, #63
vand q6, q6, q5
vadd.u64 q9, q8, q8
vswp d13,d12
veor q9, q9, q6
@ perform the final decryption with the last tweak value
vld1.8 {q0}, [r7]!
mov r0, sp
veor q0, q0, q9
mov r1, sp
vst1.8 {q0}, [sp,:128]
mov r2, r10
mov r4, r3 @ preserve fp
bl AES_decrypt
vld1.8 {q0}, [sp,:128]
veor q0, q0, q9
vst1.8 {q0}, [r8]
mov r6, r8
.Lxts_dec_steal:
ldrb r1, [r8]
ldrb r0, [r7], #1
strb r1, [r8, #0x10]
strb r0, [r8], #1
subs r9, #1
bhi .Lxts_dec_steal
vld1.8 {q0}, [r6]
mov r0, sp
veor q0, q8
mov r1, sp
vst1.8 {q0}, [sp,:128]
mov r2, r10
bl AES_decrypt
vld1.8 {q0}, [sp,:128]
veor q0, q0, q8
vst1.8 {q0}, [r6]
mov r3, r4
#endif
.Lxts_dec_ret:
bic r0, r3, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_dec_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_dec_bzero
mov sp, r3
#ifdef XTS_CHAIN_TWEAK
vst1.8 {q8}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
#endif
/*
* linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <crypto/aes.h>
#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <linux/module.h>
#include "aes_glue.h"
#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
struct BS_KEY {
struct AES_KEY rk;
int converted;
u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
} __aligned(8);
asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 iv[]);
asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
struct BS_KEY *key, u8 const iv[]);
asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 tweak[]);
asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 tweak[]);
struct aesbs_cbc_ctx {
struct AES_KEY enc;
struct BS_KEY dec;
};
struct aesbs_ctr_ctx {
struct BS_KEY enc;
};
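/*
 * For XTS, the second half of the supplied key is expanded separately into
 * ->twkey and is only used to encrypt the initial tweak.
 */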
struct aesbs_xts_ctx {
struct BS_KEY enc;
struct BS_KEY dec;
struct AES_KEY twkey;
};
static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 8;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->dec.rk = ctx->enc;
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
ctx->dec.converted = 0;
return 0;
}
static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 8;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->enc.converted = 0;
return 0;
}
static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 4;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->dec.rk = ctx->enc.rk;
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
ctx->enc.converted = ctx->dec.converted = 0;
return 0;
}
static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
while (walk.nbytes) {
u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *src = walk.src.virt.addr;
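/*
 * CBC encryption is inherently sequential, so it is done one block at a
 * time with the scalar AES_encrypt(); the two branches below differ only
 * in where the running IV is kept for in-place vs. out-of-place operation.
 */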
if (walk.dst.virt.addr == walk.src.virt.addr) {
u8 *iv = walk.iv;
do {
crypto_xor(src, iv, AES_BLOCK_SIZE);
AES_encrypt(src, src, &ctx->enc);
iv = src;
src += AES_BLOCK_SIZE;
} while (--blocks);
memcpy(walk.iv, iv, AES_BLOCK_SIZE);
} else {
u8 *dst = walk.dst.virt.addr;
do {
crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
AES_encrypt(walk.iv, dst, &ctx->enc);
memcpy(walk.iv, dst, AES_BLOCK_SIZE);
src += AES_BLOCK_SIZE;
dst += AES_BLOCK_SIZE;
} while (--blocks);
}
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
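/*
 * Groups of eight blocks are decrypted by the NEON bit-sliced code; any
 * remainder falls through to the scalar loop below.
 */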
while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
kernel_neon_begin();
bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->dec, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
while (walk.nbytes) {
u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
u8 bk[2][AES_BLOCK_SIZE];
u8 *iv = walk.iv;
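/*
 * When decrypting in place, keep a copy of each ciphertext block in one
 * of the two bounce buffers so it can still serve as the IV for the next
 * block after it has been overwritten.
 */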
do {
if (walk.dst.virt.addr == walk.src.virt.addr)
memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
AES_decrypt(src, dst, &ctx->dec.rk);
crypto_xor(dst, iv, AES_BLOCK_SIZE);
if (walk.dst.virt.addr == walk.src.virt.addr)
iv = bk[blocks & 1];
else
iv = src;
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--blocks);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
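/*
 * Add @addend to the 128-bit big-endian counter, propagating any carry
 * into the more significant words.
 */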
static void inc_be128_ctr(__be32 ctr[], u32 addend)
{
int i;
for (i = 3; i >= 0; i--, addend = 1) {
u32 n = be32_to_cpu(ctr[i]) + addend;
ctr[i] = cpu_to_be32(n);
if (n >= addend)
break;
}
}
static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
u32 blocks;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
__be32 *ctr = (__be32 *)walk.iv;
u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
/* avoid 32 bit counter overflow in the NEON code */
if (unlikely(headroom < blocks)) {
blocks = headroom + 1;
tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
}
kernel_neon_begin();
bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
walk.dst.virt.addr, blocks,
&ctx->enc, walk.iv);
kernel_neon_end();
inc_be128_ctr(ctr, blocks);
nbytes -= blocks * AES_BLOCK_SIZE;
if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
break;
err = blkcipher_walk_done(desc, &walk, tail);
}
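/*
 * Handle a final partial block (if any) by encrypting the counter with the
 * scalar AES code and xor'ing the resulting keystream into the tail.
 */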
if (walk.nbytes) {
u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
u8 ks[AES_BLOCK_SIZE];
AES_encrypt(walk.iv, ks, &ctx->enc.rk);
if (tdst != tsrc)
memcpy(tdst, tsrc, nbytes);
crypto_xor(tdst, ks, nbytes);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
/* generate the initial tweak */
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
while (walk.nbytes) {
kernel_neon_begin();
bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->enc, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
/* generate the initial tweak */
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
while (walk.nbytes) {
kernel_neon_begin();
bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->dec, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static struct crypto_alg aesbs_algs[] = { {
.cra_name = "__cbc-aes-neonbs",
.cra_driver_name = "__driver-cbc-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_cbc_set_key,
.encrypt = aesbs_cbc_encrypt,
.decrypt = aesbs_cbc_decrypt,
},
}, {
.cra_name = "__ctr-aes-neonbs",
.cra_driver_name = "__driver-ctr-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_ctr_set_key,
.encrypt = aesbs_ctr_encrypt,
.decrypt = aesbs_ctr_encrypt,
},
}, {
.cra_name = "__xts-aes-neonbs",
.cra_driver_name = "__driver-xts-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_xts_set_key,
.encrypt = aesbs_xts_encrypt,
.decrypt = aesbs_xts_decrypt,
},
}, {
.cra_name = "cbc(aes)",
.cra_driver_name = "cbc-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
}
}, {
.cra_name = "ctr(aes)",
.cra_driver_name = "ctr-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
}
}, {
.cra_name = "xts(aes)",
.cra_driver_name = "xts-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
}
} };
static int __init aesbs_mod_init(void)
{
if (!cpu_has_neon())
return -ENODEV;
return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
}
static void __exit aesbs_mod_exit(void)
{
crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
}
module_init(aesbs_mod_init);
module_exit(aesbs_mod_exit);
MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL");
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
# granted.
# ====================================================================
# Bit-sliced AES for ARM NEON
#
# February 2012.
#
# This implementation is direct adaptation of bsaes-x86_64 module for
# ARM NEON. Except that this module is endian-neutral [in the sense that
# it can be compiled for either endianness] by courtesy of vld1.8's
# neutrality. Initial version doesn't implement interface to OpenSSL,
# only low-level primitives and unsupported entry points, just enough
# to collect performance results, which for Cortex-A8 core are:
#
# encrypt 19.5 cycles per byte processed with 128-bit key
# decrypt 22.1 cycles per byte processed with 128-bit key
# key conv. 440 cycles per 128-bit key/0.18 of 8x block
#
# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
# which is [much] worse than anticipated (for further details see
# http://www.openssl.org/~appro/Snapdragon-S4.html).
#
# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
# manages in 20.0 cycles].
#
# When comparing to x86_64 results keep in mind that NEON unit is
# [mostly] single-issue and thus can't [fully] benefit from
# instruction-level parallelism. And when comparing to aes-armv4
# results keep in mind key schedule conversion overhead (see
# bsaes-x86_64.pl for further details)...
#
# <appro@openssl.org>
# April-August 2013
#
# Add CBC, CTR and XTS subroutines, adapt for kernel use.
#
# <ard.biesheuvel@linaro.org>
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
my @XMM=map("q$_",(0..15));
{
my ($key,$rounds,$const)=("r4","r5","r6");
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Sbox {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
&InBasisChange (@b);
&Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
&OutBasisChange (@b[7,1,4,2,6,5,0,3]);
}
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]
veor @b[5], @b[5], @b[6]
veor @b[3], @b[3], @b[0]
veor @b[6], @b[6], @b[2]
veor @b[5], @b[5], @b[0]
veor @b[6], @b[6], @b[3]
veor @b[3], @b[3], @b[7]
veor @b[7], @b[7], @b[5]
veor @b[3], @b[3], @b[4]
veor @b[4], @b[4], @b[5]
veor @b[2], @b[2], @b[7]
veor @b[3], @b[3], @b[1]
veor @b[1], @b[1], @b[5]
___
}
sub OutBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[0], @b[0], @b[6]
veor @b[1], @b[1], @b[4]
veor @b[4], @b[4], @b[6]
veor @b[2], @b[2], @b[0]
veor @b[6], @b[6], @b[1]
veor @b[1], @b[1], @b[5]
veor @b[5], @b[5], @b[3]
veor @b[3], @b[3], @b[7]
veor @b[7], @b[7], @b[5]
veor @b[2], @b[2], @b[5]
veor @b[4], @b[4], @b[7]
___
}
sub InvSbox {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
&InvInBasisChange (@b);
&Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
&InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
}
sub InvInBasisChange { # OutBasisChange in reverse (with twist)
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
veor @b[1], @b[1], @b[7]
veor @b[4], @b[4], @b[7]
veor @b[7], @b[7], @b[5]
veor @b[1], @b[1], @b[3]
veor @b[2], @b[2], @b[5]
veor @b[3], @b[3], @b[7]
veor @b[6], @b[6], @b[1]
veor @b[2], @b[2], @b[0]
veor @b[5], @b[5], @b[3]
veor @b[4], @b[4], @b[6]
veor @b[0], @b[0], @b[6]
veor @b[1], @b[1], @b[4]
___
}
sub InvOutBasisChange { # InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
veor @b[1], @b[1], @b[5]
veor @b[2], @b[2], @b[7]
veor @b[3], @b[3], @b[1]
veor @b[4], @b[4], @b[5]
veor @b[7], @b[7], @b[5]
veor @b[3], @b[3], @b[4]
veor @b[5], @b[5], @b[0]
veor @b[3], @b[3], @b[7]
veor @b[6], @b[6], @b[2]
veor @b[2], @b[2], @b[1]
veor @b[6], @b[6], @b[3]
veor @b[3], @b[3], @b[0]
veor @b[5], @b[5], @b[6]
___
}
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
$code.=<<___;
veor $t0, $y0, $y1
vand $t0, $t0, $x0
veor $x0, $x0, $x1
vand $t1, $x1, $y0
vand $x0, $x0, $y1
veor $x1, $t1, $t0
veor $x0, $x0, $t1
___
}
sub Mul_GF4_N { # not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
veor $t0, $y0, $y1
vand $t0, $t0, $x0
veor $x0, $x0, $x1
vand $x1, $x1, $y0
vand $x0, $x0, $y1
veor $x1, $x1, $x0
veor $x0, $x0, $t0
___
}
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
$x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
veor $t0, $y0, $y1
veor $t1, $y2, $y3
vand $t0, $t0, $x0
vand $t1, $t1, $x2
veor $x0, $x0, $x1
veor $x2, $x2, $x3
vand $x1, $x1, $y0
vand $x3, $x3, $y2
vand $x0, $x0, $y1
vand $x2, $x2, $y3
veor $x1, $x1, $x0
veor $x2, $x2, $x3
veor $x0, $x0, $t0
veor $x3, $x3, $t1
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
veor @t[0], @x[0], @x[2]
veor @t[1], @x[1], @x[3]
___
&Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
$code.=<<___;
veor @y[0], @y[0], @y[2]
veor @y[1], @y[1], @y[3]
___
Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
@x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
veor @x[0], @x[0], @t[0]
veor @x[2], @x[2], @t[0]
veor @x[1], @x[1], @t[1]
veor @x[3], @x[3], @t[1]
veor @t[0], @x[4], @x[6]
veor @t[1], @x[5], @x[7]
___
&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
@x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
veor @y[0], @y[0], @y[2]
veor @y[1], @y[1], @y[3]
___
&Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
$code.=<<___;
veor @x[4], @x[4], @t[0]
veor @x[6], @x[6], @t[0]
veor @x[5], @x[5], @t[1]
veor @x[7], @x[7], @t[1]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
veor @t[3], @x[4], @x[6]
veor @t[2], @x[5], @x[7]
veor @t[1], @x[1], @x[3]
veor @s[1], @x[7], @x[6]
vmov @t[0], @t[2]
veor @s[0], @x[0], @x[2]
vorr @t[2], @t[2], @t[1]
veor @s[3], @t[3], @t[0]
vand @s[2], @t[3], @s[0]
vorr @t[3], @t[3], @s[0]
veor @s[0], @s[0], @t[1]
vand @t[0], @t[0], @t[1]
veor @t[1], @x[3], @x[2]
vand @s[3], @s[3], @s[0]
vand @s[1], @s[1], @t[1]
veor @t[1], @x[4], @x[5]
veor @s[0], @x[1], @x[0]
veor @t[3], @t[3], @s[1]
veor @t[2], @t[2], @s[1]
vand @s[1], @t[1], @s[0]
vorr @t[1], @t[1], @s[0]
veor @t[3], @t[3], @s[3]
veor @t[0], @t[0], @s[1]
veor @t[2], @t[2], @s[2]
veor @t[1], @t[1], @s[3]
veor @t[0], @t[0], @s[2]
vand @s[0], @x[7], @x[3]
veor @t[1], @t[1], @s[2]
vand @s[1], @x[6], @x[2]
vand @s[2], @x[5], @x[1]
vorr @s[3], @x[4], @x[0]
veor @t[3], @t[3], @s[0]
veor @t[1], @t[1], @s[2]
veor @t[0], @t[0], @s[3]
veor @t[2], @t[2], @s[1]
@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
@ new smaller inversion
vand @s[2], @t[3], @t[1]
vmov @s[0], @t[0]
veor @s[1], @t[2], @s[2]
veor @s[3], @t[0], @s[2]
veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
vbsl @s[1], @t[1], @t[0]
vbsl @s[3], @t[3], @t[2]
veor @t[3], @t[3], @t[2]
vbsl @s[0], @s[1], @s[2]
vbsl @t[0], @s[2], @s[1]
vand @s[2], @s[0], @s[3]
veor @t[1], @t[1], @t[0]
veor @s[2], @s[2], @t[3]
___
# output in s3, s2, s1, t1
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
# AES linear components
sub ShiftRows {
my @x=@_[0..7];
my @t=@_[8..11];
my $mask=pop;
$code.=<<___;
vldmia $key!, {@t[0]-@t[3]}
veor @t[0], @t[0], @x[0]
veor @t[1], @t[1], @x[1]
vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
vldmia $key!, {@t[0]}
veor @t[2], @t[2], @x[2]
vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
vldmia $key!, {@t[1]}
veor @t[3], @t[3], @x[3]
vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
vldmia $key!, {@t[2]}
vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
vldmia $key!, {@t[3]}
veor @t[0], @t[0], @x[4]
veor @t[1], @t[1], @x[5]
vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
veor @t[2], @t[2], @x[6]
vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
veor @t[3], @t[3], @x[7]
vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
___
}
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16]; # optional
$code.=<<___;
vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
vext.8 @t[1], @x[1], @x[1], #12
veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
vext.8 @t[2], @x[2], @x[2], #12
veor @x[1], @x[1], @t[1]
vext.8 @t[3], @x[3], @x[3], #12
veor @x[2], @x[2], @t[2]
vext.8 @t[4], @x[4], @x[4], #12
veor @x[3], @x[3], @t[3]
vext.8 @t[5], @x[5], @x[5], #12
veor @x[4], @x[4], @t[4]
vext.8 @t[6], @x[6], @x[6], #12
veor @x[5], @x[5], @t[5]
vext.8 @t[7], @x[7], @x[7], #12
veor @x[6], @x[6], @t[6]
veor @t[1], @t[1], @x[0]
veor @x[7], @x[7], @t[7]
vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
veor @t[2], @t[2], @x[1]
veor @t[0], @t[0], @x[7]
veor @t[1], @t[1], @x[7]
vext.8 @x[1], @x[1], @x[1], #8
veor @t[5], @t[5], @x[4]
veor @x[0], @x[0], @t[0]
veor @t[6], @t[6], @x[5]
veor @x[1], @x[1], @t[1]
vext.8 @t[0], @x[4], @x[4], #8
veor @t[4], @t[4], @x[3]
vext.8 @t[1], @x[5], @x[5], #8
veor @t[7], @t[7], @x[6]
vext.8 @x[4], @x[3], @x[3], #8
veor @t[3], @t[3], @x[2]
vext.8 @x[5], @x[7], @x[7], #8
veor @t[4], @t[4], @x[7]
vext.8 @x[3], @x[6], @x[6], #8
veor @t[3], @t[3], @x[7]
vext.8 @x[6], @x[2], @x[2], #8
veor @x[7], @t[1], @t[5]
___
$code.=<<___ if (!$inv);
veor @x[2], @t[0], @t[4]
veor @x[4], @x[4], @t[3]
veor @x[5], @x[5], @t[7]
veor @x[3], @x[3], @t[6]
@ vmov @x[2], @t[0]
veor @x[6], @x[6], @t[2]
@ vmov @x[7], @t[1]
___
$code.=<<___ if ($inv);
veor @t[3], @t[3], @x[4]
veor @x[5], @x[5], @t[7]
veor @x[2], @x[3], @t[6]
veor @x[3], @t[0], @t[4]
veor @x[4], @x[6], @t[2]
vmov @x[6], @t[3]
@ vmov @x[7], @t[1]
___
}
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
@ multiplication by 0x0e
vext.8 @t[7], @x[7], @x[7], #12
vmov @t[2], @x[2]
veor @x[2], @x[2], @x[5] @ 2 5
veor @x[7], @x[7], @x[5] @ 7 5
vext.8 @t[0], @x[0], @x[0], #12
vmov @t[5], @x[5]
veor @x[5], @x[5], @x[0] @ 5 0 [1]
veor @x[0], @x[0], @x[1] @ 0 1
vext.8 @t[1], @x[1], @x[1], #12
veor @x[1], @x[1], @x[2] @ 1 25
veor @x[0], @x[0], @x[6] @ 01 6 [2]
vext.8 @t[3], @x[3], @x[3], #12
veor @x[1], @x[1], @x[3] @ 125 3 [4]
veor @x[2], @x[2], @x[0] @ 25 016 [3]
veor @x[3], @x[3], @x[7] @ 3 75
veor @x[7], @x[7], @x[6] @ 75 6 [0]
vext.8 @t[6], @x[6], @x[6], #12
vmov @t[4], @x[4]
veor @x[6], @x[6], @x[4] @ 6 4
veor @x[4], @x[4], @x[3] @ 4 375 [6]
veor @x[3], @x[3], @x[7] @ 375 756=36
veor @x[6], @x[6], @t[5] @ 64 5 [7]
veor @x[3], @x[3], @t[2] @ 36 2
vext.8 @t[5], @t[5], @t[5], #12
veor @x[3], @x[3], @t[4] @ 362 4 [5]
___
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
@ multiplication by 0x0b
veor @y[1], @y[1], @y[0]
veor @y[0], @y[0], @t[0]
vext.8 @t[2], @t[2], @t[2], #12
veor @y[1], @y[1], @t[1]
veor @y[0], @y[0], @t[5]
vext.8 @t[4], @t[4], @t[4], #12
veor @y[1], @y[1], @t[6]
veor @y[0], @y[0], @t[7]
veor @t[7], @t[7], @t[6] @ clobber t[7]
veor @y[3], @y[3], @t[0]
veor @y[1], @y[1], @y[0]
vext.8 @t[0], @t[0], @t[0], #12
veor @y[2], @y[2], @t[1]
veor @y[4], @y[4], @t[1]
vext.8 @t[1], @t[1], @t[1], #12
veor @y[2], @y[2], @t[2]
veor @y[3], @y[3], @t[2]
veor @y[5], @y[5], @t[2]
veor @y[2], @y[2], @t[7]
vext.8 @t[2], @t[2], @t[2], #12
veor @y[3], @y[3], @t[3]
veor @y[6], @y[6], @t[3]
veor @y[4], @y[4], @t[3]
veor @y[7], @y[7], @t[4]
vext.8 @t[3], @t[3], @t[3], #12
veor @y[5], @y[5], @t[4]
veor @y[7], @y[7], @t[7]
veor @t[7], @t[7], @t[5] @ clobber t[7] even more
veor @y[3], @y[3], @t[5]
veor @y[4], @y[4], @t[4]
veor @y[5], @y[5], @t[7]
vext.8 @t[4], @t[4], @t[4], #12
veor @y[6], @y[6], @t[7]
veor @y[4], @y[4], @t[7]
veor @t[7], @t[7], @t[5]
vext.8 @t[5], @t[5], @t[5], #12
@ multiplication by 0x0d
veor @y[4], @y[4], @y[7]
veor @t[7], @t[7], @t[6] @ restore t[7]
veor @y[7], @y[7], @t[4]
vext.8 @t[6], @t[6], @t[6], #12
veor @y[2], @y[2], @t[0]
veor @y[7], @y[7], @t[5]
vext.8 @t[7], @t[7], @t[7], #12
veor @y[2], @y[2], @t[2]
veor @y[3], @y[3], @y[1]
veor @y[1], @y[1], @t[1]
veor @y[0], @y[0], @t[0]
veor @y[3], @y[3], @t[0]
veor @y[1], @y[1], @t[5]
veor @y[0], @y[0], @t[5]
vext.8 @t[0], @t[0], @t[0], #12
veor @y[1], @y[1], @t[7]
veor @y[0], @y[0], @t[6]
veor @y[3], @y[3], @y[1]
veor @y[4], @y[4], @t[1]
vext.8 @t[1], @t[1], @t[1], #12
veor @y[7], @y[7], @t[7]
veor @y[4], @y[4], @t[2]
veor @y[5], @y[5], @t[2]
veor @y[2], @y[2], @t[6]
veor @t[6], @t[6], @t[3] @ clobber t[6]
vext.8 @t[2], @t[2], @t[2], #12
veor @y[4], @y[4], @y[7]
veor @y[3], @y[3], @t[6]
veor @y[6], @y[6], @t[6]
veor @y[5], @y[5], @t[5]
vext.8 @t[5], @t[5], @t[5], #12
veor @y[6], @y[6], @t[4]
vext.8 @t[4], @t[4], @t[4], #12
veor @y[5], @y[5], @t[6]
veor @y[6], @y[6], @t[7]
vext.8 @t[7], @t[7], @t[7], #12
veor @t[6], @t[6], @t[3] @ restore t[6]
vext.8 @t[3], @t[3], @t[3], #12
@ multiplication by 0x09
veor @y[4], @y[4], @y[1]
veor @t[1], @t[1], @y[1] @ t[1]=y[1]
veor @t[0], @t[0], @t[5] @ clobber t[0]
vext.8 @t[6], @t[6], @t[6], #12
veor @t[1], @t[1], @t[5]
veor @y[3], @y[3], @t[0]
veor @t[0], @t[0], @y[0] @ t[0]=y[0]
veor @t[1], @t[1], @t[6]
veor @t[6], @t[6], @t[7] @ clobber t[6]
veor @y[4], @y[4], @t[1]
veor @y[7], @y[7], @t[4]
veor @y[6], @y[6], @t[3]
veor @y[5], @y[5], @t[2]
veor @t[4], @t[4], @y[4] @ t[4]=y[4]
veor @t[3], @t[3], @y[3] @ t[3]=y[3]
veor @t[5], @t[5], @y[5] @ t[5]=y[5]
veor @t[2], @t[2], @y[2] @ t[2]=y[2]
veor @t[3], @t[3], @t[7]
veor @XMM[5], @t[5], @t[6]
veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
veor @XMM[2], @t[2], @t[6]
veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
vmov @XMM[0], @t[0]
vmov @XMM[1], @t[1]
@ vmov @XMM[2], @t[2]
vmov @XMM[3], @t[3]
vmov @XMM[4], @t[4]
@ vmov @XMM[5], @t[5]
@ vmov @XMM[6], @t[6]
@ vmov @XMM[7], @t[7]
___
}
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];
# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
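# For instance, the (0,0) entry of the product works out in GF(2^8) as
#   02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0a ^ 04 = 0e
# so InvMixColumns below is done as the cheap 0x05/0x04 step followed by a
# call to the ordinary MixColumns routine.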
$code.=<<___;
@ multiplication by 0x05-0x00-0x04-0x00
vext.8 @t[0], @x[0], @x[0], #8
vext.8 @t[6], @x[6], @x[6], #8
vext.8 @t[7], @x[7], @x[7], #8
veor @t[0], @t[0], @x[0]
vext.8 @t[1], @x[1], @x[1], #8
veor @t[6], @t[6], @x[6]
vext.8 @t[2], @x[2], @x[2], #8
veor @t[7], @t[7], @x[7]
vext.8 @t[3], @x[3], @x[3], #8
veor @t[1], @t[1], @x[1]
vext.8 @t[4], @x[4], @x[4], #8
veor @t[2], @t[2], @x[2]
vext.8 @t[5], @x[5], @x[5], #8
veor @t[3], @t[3], @x[3]
veor @t[4], @t[4], @x[4]
veor @t[5], @t[5], @x[5]
veor @x[0], @x[0], @t[6]
veor @x[1], @x[1], @t[6]
veor @x[2], @x[2], @t[0]
veor @x[4], @x[4], @t[2]
veor @x[3], @x[3], @t[1]
veor @x[1], @x[1], @t[7]
veor @x[2], @x[2], @t[7]
veor @x[4], @x[4], @t[6]
veor @x[5], @x[5], @t[3]
veor @x[3], @x[3], @t[6]
veor @x[6], @x[6], @t[4]
veor @x[4], @x[4], @t[7]
veor @x[5], @x[5], @t[7]
veor @x[7], @x[7], @t[5]
___
&MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
}
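# swapmove is the usual bit-matrix transposition step: it swaps the bits of
# $a selected by $mask with the corresponding bits of $b at distance $n,
# i.e. t = ((b >> n) ^ a) & mask; a ^= t; b ^= t << n.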
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
vshr.u64 $t, $b, #$n
veor $t, $t, $a
vand $t, $t, $mask
veor $a, $a, $t
vshl.u64 $t, $t, #$n
veor $b, $b, $t
___
}
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
vshr.u64 $t0, $b0, #$n
vshr.u64 $t1, $b1, #$n
veor $t0, $t0, $a0
veor $t1, $t1, $a1
vand $t0, $t0, $mask
vand $t1, $t1, $mask
veor $a0, $a0, $t0
vshl.u64 $t0, $t0, #$n
veor $a1, $a1, $t1
vshl.u64 $t1, $t1, #$n
veor $b0, $b0, $t0
veor $b1, $b1, $t1
___
}
sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
vmov.i8 $t0,#0x55 @ compose .LBS0
vmov.i8 $t1,#0x33 @ compose .LBS1
___
&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
vmov.i8 $t0,#0x0f @ compose .LBS2
___
&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define VFP_ABI_FRAME 0x40
#else
# define VFP_ABI_PUSH
# define VFP_ABI_POP
# define VFP_ABI_FRAME 0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
#endif
#ifdef __thumb__
# define adrl adr
#endif
#if __ARM_ARCH__>=7
.text
.syntax unified @ ARMv7-capable assembler is expected to handle this
#ifdef __thumb2__
.thumb
#else
.code 32
#endif
.fpu neon
.type _bsaes_decrypt8,%function
.align 4
_bsaes_decrypt8:
adr $const,_bsaes_decrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
add $const,$const,#.LM0ISR-_bsaes_decrypt8
vldmia $const!, {@XMM[8]} @ .LM0ISR
veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
veor @XMM[11], @XMM[1], @XMM[9]
vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
veor @XMM[12], @XMM[2], @XMM[9]
vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
veor @XMM[13], @XMM[3], @XMM[9]
vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
veor @XMM[14], @XMM[4], @XMM[9]
vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
veor @XMM[15], @XMM[5], @XMM[9]
vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
veor @XMM[10], @XMM[6], @XMM[9]
vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
veor @XMM[11], @XMM[7], @XMM[9]
vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
___
&bitslice (@XMM[0..7, 8..11]);
$code.=<<___;
sub $rounds,$rounds,#1
b .Ldec_sbox
.align 4
.Ldec_loop:
___
&ShiftRows (@XMM[0..7, 8..12]);
$code.=".Ldec_sbox:\n";
&InvSbox (@XMM[0..7, 8..15]);
$code.=<<___;
subs $rounds,$rounds,#1
bcc .Ldec_done
___
&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
vldmia $const, {@XMM[12]} @ .LISR
ite eq @ Thumb2 thing, sanity check in ARM
addeq $const,$const,#0x10
bne .Ldec_loop
vldmia $const, {@XMM[12]} @ .LISRM0
b .Ldec_loop
.align 4
.Ldec_done:
___
&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
vldmia $key, {@XMM[8]} @ last round key
veor @XMM[6], @XMM[6], @XMM[8]
veor @XMM[4], @XMM[4], @XMM[8]
veor @XMM[2], @XMM[2], @XMM[8]
veor @XMM[7], @XMM[7], @XMM[8]
veor @XMM[3], @XMM[3], @XMM[8]
veor @XMM[5], @XMM[5], @XMM[8]
veor @XMM[0], @XMM[0], @XMM[8]
veor @XMM[1], @XMM[1], @XMM[8]
bx lr
.size _bsaes_decrypt8,.-_bsaes_decrypt8
.type _bsaes_const,%object
.align 6
_bsaes_const:
.LM0ISR: @ InvShiftRows constants
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LM0SR: @ ShiftRows constants
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02
.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 6
.size _bsaes_const,.-_bsaes_const
.type _bsaes_encrypt8,%function
.align 4
_bsaes_encrypt8:
adr $const,_bsaes_encrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
sub $const,$const,#_bsaes_encrypt8-.LM0SR
vldmia $const!, {@XMM[8]} @ .LM0SR
_bsaes_encrypt8_alt:
veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
veor @XMM[11], @XMM[1], @XMM[9]
vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
veor @XMM[12], @XMM[2], @XMM[9]
vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
veor @XMM[13], @XMM[3], @XMM[9]
vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
veor @XMM[14], @XMM[4], @XMM[9]
vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
veor @XMM[15], @XMM[5], @XMM[9]
vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
veor @XMM[10], @XMM[6], @XMM[9]
vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
veor @XMM[11], @XMM[7], @XMM[9]
vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
_bsaes_encrypt8_bitslice:
___
&bitslice (@XMM[0..7, 8..11]);
$code.=<<___;
sub $rounds,$rounds,#1
b .Lenc_sbox
.align 4
.Lenc_loop:
___
&ShiftRows (@XMM[0..7, 8..12]);
$code.=".Lenc_sbox:\n";
&Sbox (@XMM[0..7, 8..15]);
$code.=<<___;
subs $rounds,$rounds,#1
bcc .Lenc_done
___
&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
vldmia $const, {@XMM[12]} @ .LSR
	ite	eq				@ Thumb2 thing, sanity check in ARM
addeq $const,$const,#0x10
bne .Lenc_loop
vldmia $const, {@XMM[12]} @ .LSRM0
b .Lenc_loop
.align 4
.Lenc_done:
___
# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
vldmia $key, {@XMM[8]} @ last round key
veor @XMM[4], @XMM[4], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[8]
veor @XMM[3], @XMM[3], @XMM[8]
veor @XMM[7], @XMM[7], @XMM[8]
veor @XMM[2], @XMM[2], @XMM[8]
veor @XMM[5], @XMM[5], @XMM[8]
veor @XMM[0], @XMM[0], @XMM[8]
veor @XMM[1], @XMM[1], @XMM[8]
bx lr
.size _bsaes_encrypt8,.-_bsaes_encrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
&swapmove (@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
vmov @x[2], @x[0]
vmov @x[3], @x[1]
___
#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
&swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
vmov @x[4], @x[0]
vmov @x[6], @x[2]
vmov @x[5], @x[1]
vmov @x[7], @x[3]
___
&swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
&swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
}
$code.=<<___;
.type _bsaes_key_convert,%function
.align 4
_bsaes_key_convert:
adr $const,_bsaes_key_convert
vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
sub $const,$const,#_bsaes_key_convert-.LM0
vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
vmov.i8 @XMM[8], #0x01 @ bit masks
vmov.i8 @XMM[9], #0x02
vmov.i8 @XMM[10], #0x04
vmov.i8 @XMM[11], #0x08
vmov.i8 @XMM[12], #0x10
vmov.i8 @XMM[13], #0x20
vldmia $const, {@XMM[14]} @ .LM0
#ifdef __ARMEL__
vrev32.8 @XMM[7], @XMM[7]
vrev32.8 @XMM[15], @XMM[15]
#endif
sub $rounds,$rounds,#1
vstmia $out!, {@XMM[7]} @ save round 0 key
b .Lkey_loop
.align 4
.Lkey_loop:
vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
vmov.i8 @XMM[6], #0x40
vmov.i8 @XMM[15], #0x80
vtst.8 @XMM[0], @XMM[7], @XMM[8]
vtst.8 @XMM[1], @XMM[7], @XMM[9]
vtst.8 @XMM[2], @XMM[7], @XMM[10]
vtst.8 @XMM[3], @XMM[7], @XMM[11]
vtst.8 @XMM[4], @XMM[7], @XMM[12]
vtst.8 @XMM[5], @XMM[7], @XMM[13]
vtst.8 @XMM[6], @XMM[7], @XMM[6]
vtst.8 @XMM[7], @XMM[7], @XMM[15]
vld1.8 {@XMM[15]}, [$inp]! @ load next round key
vmvn @XMM[0], @XMM[0] @ "pnot"
vmvn @XMM[1], @XMM[1]
vmvn @XMM[5], @XMM[5]
vmvn @XMM[6], @XMM[6]
#ifdef __ARMEL__
vrev32.8 @XMM[15], @XMM[15]
#endif
subs $rounds,$rounds,#1
vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
bne .Lkey_loop
vmov.i8 @XMM[7],#0x63 @ compose .L63
@ don't save last round key
bx lr
.size _bsaes_key_convert,.-_bsaes_key_convert
___
}
if (0) {	# the following four functions are an unsupported interface
# used for benchmarking...
$code.=<<___;
.globl bsaes_enc_key_convert
.type bsaes_enc_key_convert,%function
.align 4
bsaes_enc_key_convert:
stmdb sp!,{r4-r6,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r5,[$inp,#240] @ pass rounds
mov r4,$inp @ pass key
mov r12,$out @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r6,pc}
.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
.globl bsaes_encrypt_128
.type bsaes_encrypt_128,%function
.align 4
bsaes_encrypt_128:
stmdb sp!,{r4-r6,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
.Lenc128_loop:
vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
mov r4,$key @ pass the key
vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
mov r5,#10 @ pass rounds
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
bl _bsaes_encrypt8
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
vst1.8 {@XMM[4]}, [$out]!
vst1.8 {@XMM[6]}, [$out]!
vst1.8 {@XMM[3]}, [$out]!
vst1.8 {@XMM[7]}, [$out]!
vst1.8 {@XMM[2]}, [$out]!
subs $len,$len,#0x80
vst1.8 {@XMM[5]}, [$out]!
bhi .Lenc128_loop
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r6,pc}
.size bsaes_encrypt_128,.-bsaes_encrypt_128
.globl bsaes_dec_key_convert
.type bsaes_dec_key_convert,%function
.align 4
bsaes_dec_key_convert:
stmdb sp!,{r4-r6,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r5,[$inp,#240] @ pass rounds
mov r4,$inp @ pass key
mov r12,$out @ pass key schedule
bl _bsaes_key_convert
vldmia $out, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia $out, {@XMM[7]}
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r6,pc}
.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
.globl bsaes_decrypt_128
.type bsaes_decrypt_128,%function
.align 4
bsaes_decrypt_128:
stmdb sp!,{r4-r6,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
.Ldec128_loop:
vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
mov r4,$key @ pass the key
vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
mov r5,#10 @ pass rounds
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
bl _bsaes_decrypt8
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
vst1.8 {@XMM[6]}, [$out]!
vst1.8 {@XMM[4]}, [$out]!
vst1.8 {@XMM[2]}, [$out]!
vst1.8 {@XMM[7]}, [$out]!
vst1.8 {@XMM[3]}, [$out]!
subs $len,$len,#0x80
vst1.8 {@XMM[5]}, [$out]!
bhi .Ldec128_loop
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r6,pc}
.size bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
my ($keysched)=("sp");
$code.=<<___;
.extern AES_cbc_encrypt
.extern AES_decrypt
.global bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
.align 5
bsaes_cbc_encrypt:
#ifndef __KERNEL__
cmp $len, #128
#ifndef __thumb__
blo AES_cbc_encrypt
#else
bhs 1f
b AES_cbc_encrypt
1:
#endif
#endif
@ it is up to the caller to make sure we are called with enc == 0
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
ldr $ivp, [ip] @ IV is 1st arg on the stack
mov $len, $len, lsr#4 @ len in 16 byte blocks
sub sp, #0x10 @ scratch space to carry over the IV
mov $fp, sp @ save sp
ldr $rounds, [$key, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
	add	r12, #`128-32`			@ size of bit-sliced key schedule
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12 @ sp is $keysched
bl _bsaes_key_convert
vldmia $keysched, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia $keysched, {@XMM[7]}
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
@ populate the key schedule
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
add r4, $key, #248
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
.align 2
0:
#endif
vld1.8 {@XMM[15]}, [$ivp] @ load IV
b .Lcbc_dec_loop
.align 4
.Lcbc_dec_loop:
subs $len, $len, #0x8
bmi .Lcbc_dec_loop_finish
vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, $keysched @ pass the key
#else
add r4, $key, #248
#endif
vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
mov r5, $rounds
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
sub $inp, $inp, #0x60
vstmia $fp, {@XMM[15]} @ put aside IV
bl _bsaes_decrypt8
vldmia $fp, {@XMM[14]} @ reload IV
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[1], @XMM[1], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[9]
vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
veor @XMM[4], @XMM[4], @XMM[10]
veor @XMM[2], @XMM[2], @XMM[11]
vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
veor @XMM[7], @XMM[7], @XMM[12]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
veor @XMM[3], @XMM[3], @XMM[13]
vst1.8 {@XMM[6]}, [$out]!
veor @XMM[5], @XMM[5], @XMM[14]
vst1.8 {@XMM[4]}, [$out]!
vst1.8 {@XMM[2]}, [$out]!
vst1.8 {@XMM[7]}, [$out]!
vst1.8 {@XMM[3]}, [$out]!
vst1.8 {@XMM[5]}, [$out]!
b .Lcbc_dec_loop
.Lcbc_dec_loop_finish:
adds $len, $len, #8
beq .Lcbc_dec_done
vld1.8 {@XMM[0]}, [$inp]! @ load input
cmp $len, #2
blo .Lcbc_dec_one
vld1.8 {@XMM[1]}, [$inp]!
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, $keysched @ pass the key
#else
add r4, $key, #248
#endif
mov r5, $rounds
vstmia $fp, {@XMM[15]} @ put aside IV
beq .Lcbc_dec_two
vld1.8 {@XMM[2]}, [$inp]!
cmp $len, #4
blo .Lcbc_dec_three
vld1.8 {@XMM[3]}, [$inp]!
beq .Lcbc_dec_four
vld1.8 {@XMM[4]}, [$inp]!
cmp $len, #6
blo .Lcbc_dec_five
vld1.8 {@XMM[5]}, [$inp]!
beq .Lcbc_dec_six
vld1.8 {@XMM[6]}, [$inp]!
sub $inp, $inp, #0x70
bl _bsaes_decrypt8
vldmia $fp, {@XMM[14]} @ reload IV
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[1], @XMM[1], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[9]
vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
veor @XMM[4], @XMM[4], @XMM[10]
veor @XMM[2], @XMM[2], @XMM[11]
vld1.8 {@XMM[15]}, [$inp]!
veor @XMM[7], @XMM[7], @XMM[12]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
veor @XMM[3], @XMM[3], @XMM[13]
vst1.8 {@XMM[6]}, [$out]!
vst1.8 {@XMM[4]}, [$out]!
vst1.8 {@XMM[2]}, [$out]!
vst1.8 {@XMM[7]}, [$out]!
vst1.8 {@XMM[3]}, [$out]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
sub $inp, $inp, #0x60
bl _bsaes_decrypt8
vldmia $fp,{@XMM[14]} @ reload IV
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[1], @XMM[1], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[9]
vld1.8 {@XMM[12]}, [$inp]!
veor @XMM[4], @XMM[4], @XMM[10]
veor @XMM[2], @XMM[2], @XMM[11]
vld1.8 {@XMM[15]}, [$inp]!
veor @XMM[7], @XMM[7], @XMM[12]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
vst1.8 {@XMM[6]}, [$out]!
vst1.8 {@XMM[4]}, [$out]!
vst1.8 {@XMM[2]}, [$out]!
vst1.8 {@XMM[7]}, [$out]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
sub $inp, $inp, #0x50
bl _bsaes_decrypt8
vldmia $fp, {@XMM[14]} @ reload IV
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[1], @XMM[1], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[9]
vld1.8 {@XMM[15]}, [$inp]!
veor @XMM[4], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
veor @XMM[2], @XMM[2], @XMM[11]
vst1.8 {@XMM[6]}, [$out]!
vst1.8 {@XMM[4]}, [$out]!
vst1.8 {@XMM[2]}, [$out]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
sub $inp, $inp, #0x40
bl _bsaes_decrypt8
vldmia $fp, {@XMM[14]} @ reload IV
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[10]}, [$inp]!
veor @XMM[1], @XMM[1], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[9]
vld1.8 {@XMM[15]}, [$inp]!
veor @XMM[4], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
vst1.8 {@XMM[6]}, [$out]!
vst1.8 {@XMM[4]}, [$out]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
sub $inp, $inp, #0x30
bl _bsaes_decrypt8
vldmia $fp, {@XMM[14]} @ reload IV
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[15]}, [$inp]!
veor @XMM[1], @XMM[1], @XMM[8]
veor @XMM[6], @XMM[6], @XMM[9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
vst1.8 {@XMM[6]}, [$out]!
b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
sub $inp, $inp, #0x20
bl _bsaes_decrypt8
vldmia $fp, {@XMM[14]} @ reload IV
vld1.8 {@XMM[8]}, [$inp]! @ reload input
veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
vld1.8 {@XMM[15]}, [$inp]! @ reload input
veor @XMM[1], @XMM[1], @XMM[8]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
sub $inp, $inp, #0x10
mov $rounds, $out @ save original out pointer
mov $out, $fp @ use the iv scratch space as out buffer
mov r2, $key
vmov @XMM[4],@XMM[15] @ just in case ensure that IV
vmov @XMM[5],@XMM[0] @ and input are preserved
bl AES_decrypt
vld1.8 {@XMM[0]}, [$fp,:64] @ load result
veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
vst1.8 {@XMM[0]}, [$rounds] @ write output
.Lcbc_dec_done:
#ifndef BSAES_ASM_EXTENDED_KEY
vmov.i32 q0, #0
vmov.i32 q1, #0
.Lcbc_dec_bzero: @ wipe key schedule [if any]
vstmia $keysched!, {q0-q1}
cmp $keysched, $fp
bne .Lcbc_dec_bzero
#endif
mov sp, $fp
add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
vst1.8 {@XMM[15]}, [$ivp] @ return IV
VFP_ABI_POP
ldmia sp!, {r4-r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
___
}
{
my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
my $const = "r6"; # shared with _bsaes_encrypt8_alt
my $keysched = "sp";
$code.=<<___;
.extern AES_encrypt
.global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
cmp $len, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
ldr $ctr, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov $fp, sp @ save sp
ldr $rounds, [$key, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
add r12, #`128-32` @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12 @ sp is $keysched
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
vld1.8 {@XMM[0]}, [$ctr] @ load counter
add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
vldmia $keysched, {@XMM[4]} @ load round0 key
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
@ populate the key schedule
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
.align 2
0: add r12, $key, #248
vld1.8 {@XMM[0]}, [$ctr] @ load counter
adrl $ctr, .LREVM0SR @ borrow $ctr
vldmia r12, {@XMM[4]} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
#endif
vmov.i32 @XMM[8],#1 @ compose 1<<96
veor @XMM[9],@XMM[9],@XMM[9]
vrev32.8 @XMM[0],@XMM[0]
vext.8 @XMM[8],@XMM[9],@XMM[8],#4
vrev32.8 @XMM[4],@XMM[4]
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
vadd.u32 @XMM[4], @XMM[1], @XMM[10]
vadd.u32 @XMM[5], @XMM[2], @XMM[10]
vadd.u32 @XMM[6], @XMM[3], @XMM[10]
vadd.u32 @XMM[7], @XMM[4], @XMM[10]
vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
@ to flip byte order in 32-bit counter
vldmia $keysched, {@XMM[9]} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, $keysched, #0x10 @ pass next round key
#else
add r4, $key, #`248+16`
#endif
vldmia $ctr, {@XMM[8]} @ .LREVM0SR
mov r5, $rounds @ pass rounds
vstmia $fp, {@XMM[10]} @ save next counter
sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
bl _bsaes_encrypt8_alt
subs $len, $len, #8
blo .Lctr_enc_loop_done
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[0], @XMM[8]
veor @XMM[1], @XMM[9]
vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
veor @XMM[4], @XMM[10]
veor @XMM[6], @XMM[11]
vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
veor @XMM[3], @XMM[12]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
veor @XMM[7], @XMM[13]
veor @XMM[2], @XMM[14]
vst1.8 {@XMM[4]}, [$out]!
veor @XMM[5], @XMM[15]
vst1.8 {@XMM[6]}, [$out]!
vmov.i32 @XMM[8], #1 @ compose 1<<96
vst1.8 {@XMM[3]}, [$out]!
veor @XMM[9], @XMM[9], @XMM[9]
vst1.8 {@XMM[7]}, [$out]!
vext.8 @XMM[8], @XMM[9], @XMM[8], #4
vst1.8 {@XMM[2]}, [$out]!
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vst1.8 {@XMM[5]}, [$out]!
vldmia $fp, {@XMM[0]} @ load counter
bne .Lctr_enc_loop
b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
add $len, $len, #8
vld1.8 {@XMM[8]}, [$inp]! @ load input
veor @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]! @ write output
cmp $len, #2
blo .Lctr_enc_done
vld1.8 {@XMM[9]}, [$inp]!
veor @XMM[1], @XMM[9]
vst1.8 {@XMM[1]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[10]}, [$inp]!
veor @XMM[4], @XMM[10]
vst1.8 {@XMM[4]}, [$out]!
cmp $len, #4
blo .Lctr_enc_done
vld1.8 {@XMM[11]}, [$inp]!
veor @XMM[6], @XMM[11]
vst1.8 {@XMM[6]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[12]}, [$inp]!
veor @XMM[3], @XMM[12]
vst1.8 {@XMM[3]}, [$out]!
cmp $len, #6
blo .Lctr_enc_done
vld1.8 {@XMM[13]}, [$inp]!
veor @XMM[7], @XMM[13]
vst1.8 {@XMM[7]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[14]}, [$inp]
veor @XMM[2], @XMM[14]
vst1.8 {@XMM[2]}, [$out]!
.Lctr_enc_done:
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
vstmia $keysched!, {q0-q1}
cmp $keysched, $fp
bne .Lctr_enc_bzero
#else
vstmia $keysched, {q0-q1}
#endif
mov sp, $fp
add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.align 4
.Lctr_enc_short:
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4-r8, lr}
mov r4, $inp @ copy arguments
mov r5, $out
mov r6, $len
mov r7, $key
ldr r8, [ip, #12] @ load counter LSW
vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
rev r8, r8
#endif
sub sp, sp, #0x10
vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
mov r2, r7 @ key
bl AES_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
str r0, [sp, #0x1c] @ next counter value
#else
str r8, [sp, #0x1c] @ next counter value
#endif
veor @XMM[0],@XMM[0],@XMM[1]
vst1.8 {@XMM[0]}, [r5]! @ store output
subs r6, r6, #1
bne .Lctr_enc_short_loop
vmov.i32 q0, #0
vmov.i32 q1, #0
vstmia sp!, {q0-q1}
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
{
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
my $const="r6"; # returned by _bsaes_key_convert
my $twmask=@XMM[5];
my @T=@XMM[6..7];
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future $fp
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0,sp @ pointer to initial tweak
#endif
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
@ add r12, #`128-32` @ size of bit-sliced key schedule
sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
subs $len, #0x80
blo .Lxts_enc_short
b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
vadd.u64 @XMM[8], @XMM[15], @XMM[15]
vst1.64 {@XMM[15]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
veor @XMM[8], @XMM[8], @T[0]
vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[2], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
veor @XMM[13], @XMM[5], @XMM[15]
vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
subs $len, #0x80
bpl .Lxts_enc_loop
.Lxts_enc_short:
adds $len, #0x70
bmi .Lxts_enc_done
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
subs $len, #0x10
bmi .Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
sub $len, #0x10
vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vld1.64 {@XMM[14]}, [r0,:128]!
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[2], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vst1.8 {@XMM[12]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_6:
vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align 5
.Lxts_magic:
.quad 1, 0x87
.align 5
.Lxts_enc_5:
vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vst1.8 {@XMM[10]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_4:
vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_3:
vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
vld1.64 {@XMM[10]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vst1.8 {@XMM[8]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_2:
vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_1:
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_encrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]!
mov $fp, r4
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_enc_done:
#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_enc_ret
sub r6, $out, #0x10
.Lxts_enc_steal:
ldrb r0, [$inp], #1
ldrb r1, [$out, #-0x10]
strb r0, [$out, #-0x10]
strb r1, [$out], #1
subs $len, #1
bhi .Lxts_enc_steal
vld1.8 {@XMM[0]}, [r6]
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_encrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
#endif
.Lxts_enc_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_enc_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_enc_bzero
mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
vst1.8 {@XMM[8]}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future $fp
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0, sp @ pointer to initial tweak
#endif
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
@ add r12, #`128-32` @ size of bit-sliced key schedule
sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
add r4, sp, #0x90
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
add r4, $key, #248
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
tst $len, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne $len, #0x10 @ subtract another 16 bytes
subs $len, #0x80
blo .Lxts_dec_short
b .Lxts_dec_loop
.align 4
.Lxts_dec_loop:
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
vadd.u64 @XMM[8], @XMM[15], @XMM[15]
vst1.64 {@XMM[15]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
veor @XMM[8], @XMM[8], @T[0]
vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[3], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
veor @XMM[13], @XMM[5], @XMM[15]
vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
subs $len, #0x80
bpl .Lxts_dec_loop
.Lxts_dec_short:
adds $len, #0x70
bmi .Lxts_dec_done
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
subs $len, #0x10
bmi .Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
sub $len, #0x10
vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vld1.64 {@XMM[14]}, [r0,:128]!
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[3], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vst1.8 {@XMM[12]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_6:
vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_5:
vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vst1.8 {@XMM[10]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_4:
vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_3:
vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
vld1.64 {@XMM[10]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vst1.8 {@XMM[8]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_2:
vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_1:
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
mov r5, $magic @ preserve magic
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]!
mov $fp, r4
mov $magic, r5
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_dec_done:
#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_dec_ret
@ calculate one round of extra tweak for the stolen ciphertext
vldmia $magic, {$twmask}
vshr.s64 @XMM[6], @XMM[8], #63
vand @XMM[6], @XMM[6], $twmask
vadd.u64 @XMM[9], @XMM[8], @XMM[8]
vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
veor @XMM[9], @XMM[9], @XMM[6]
@ perform the final decryption with the last tweak value
vld1.8 {@XMM[0]}, [$inp]!
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[9]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[9]
vst1.8 {@XMM[0]}, [$out]
mov r6, $out
.Lxts_dec_steal:
ldrb r1, [$out]
ldrb r0, [$inp], #1
strb r1, [$out, #0x10]
strb r0, [$out], #1
subs $len, #1
bhi .Lxts_dec_steal
vld1.8 {@XMM[0]}, [r6]
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
#endif
.Lxts_dec_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_dec_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_dec_bzero
mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
vst1.8 {@XMM[8]}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;	# evaluate `...` expressions (e.g. &Dlo()/&Dhi() and 128-32) in the generated code
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);	# stop at the first line that is neither a '#' comment nor blank
print;
}
close SELF;
print $code;
close STDOUT;
...@@ -24,6 +24,7 @@ generic-y += sembuf.h ...@@ -24,6 +24,7 @@ generic-y += sembuf.h
generic-y += serial.h generic-y += serial.h
generic-y += shmbuf.h generic-y += shmbuf.h
generic-y += siginfo.h generic-y += siginfo.h
generic-y += simd.h
generic-y += sizes.h generic-y += sizes.h
generic-y += socket.h generic-y += socket.h
generic-y += sockios.h generic-y += sockios.h
......
...@@ -53,6 +53,13 @@ ...@@ -53,6 +53,13 @@
#define put_byte_3 lsl #0 #define put_byte_3 lsl #0
#endif #endif
/* Select code for any configuration running in BE8 mode */
#ifdef CONFIG_CPU_ENDIAN_BE8
#define ARM_BE8(code...) code
#else
#define ARM_BE8(code...)
#endif
/* /*
* Data preload for architectures that support it * Data preload for architectures that support it
*/ */
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#define __ASM_ARM_ATOMIC_H #define __ASM_ARM_ATOMIC_H
#include <linux/compiler.h> #include <linux/compiler.h>
#include <linux/prefetch.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/irqflags.h> #include <linux/irqflags.h>
#include <asm/barrier.h> #include <asm/barrier.h>
...@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v) ...@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v)
unsigned long tmp; unsigned long tmp;
int result; int result;
prefetchw(&v->counter);
__asm__ __volatile__("@ atomic_add\n" __asm__ __volatile__("@ atomic_add\n"
"1: ldrex %0, [%3]\n" "1: ldrex %0, [%3]\n"
" add %0, %0, %4\n" " add %0, %0, %4\n"
...@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v) ...@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v)
unsigned long tmp; unsigned long tmp;
int result; int result;
prefetchw(&v->counter);
__asm__ __volatile__("@ atomic_sub\n" __asm__ __volatile__("@ atomic_sub\n"
"1: ldrex %0, [%3]\n" "1: ldrex %0, [%3]\n"
" sub %0, %0, %4\n" " sub %0, %0, %4\n"
...@@ -260,6 +263,7 @@ static inline void atomic64_set(atomic64_t *v, long long i) ...@@ -260,6 +263,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
{ {
long long tmp; long long tmp;
prefetchw(&v->counter);
__asm__ __volatile__("@ atomic64_set\n" __asm__ __volatile__("@ atomic64_set\n"
"1: ldrexd %0, %H0, [%2]\n" "1: ldrexd %0, %H0, [%2]\n"
" strexd %0, %3, %H3, [%2]\n" " strexd %0, %3, %H3, [%2]\n"
...@@ -276,10 +280,11 @@ static inline void atomic64_add(long long i, atomic64_t *v) ...@@ -276,10 +280,11 @@ static inline void atomic64_add(long long i, atomic64_t *v)
long long result; long long result;
unsigned long tmp; unsigned long tmp;
prefetchw(&v->counter);
__asm__ __volatile__("@ atomic64_add\n" __asm__ __volatile__("@ atomic64_add\n"
"1: ldrexd %0, %H0, [%3]\n" "1: ldrexd %0, %H0, [%3]\n"
" adds %0, %0, %4\n" " adds %Q0, %Q0, %Q4\n"
" adc %H0, %H0, %H4\n" " adc %R0, %R0, %R4\n"
" strexd %1, %0, %H0, [%3]\n" " strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n" " teq %1, #0\n"
" bne 1b" " bne 1b"
...@@ -297,8 +302,8 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) ...@@ -297,8 +302,8 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v)
__asm__ __volatile__("@ atomic64_add_return\n" __asm__ __volatile__("@ atomic64_add_return\n"
"1: ldrexd %0, %H0, [%3]\n" "1: ldrexd %0, %H0, [%3]\n"
" adds %0, %0, %4\n" " adds %Q0, %Q0, %Q4\n"
" adc %H0, %H0, %H4\n" " adc %R0, %R0, %R4\n"
" strexd %1, %0, %H0, [%3]\n" " strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n" " teq %1, #0\n"
" bne 1b" " bne 1b"
...@@ -316,10 +321,11 @@ static inline void atomic64_sub(long long i, atomic64_t *v) ...@@ -316,10 +321,11 @@ static inline void atomic64_sub(long long i, atomic64_t *v)
long long result; long long result;
unsigned long tmp; unsigned long tmp;
prefetchw(&v->counter);
__asm__ __volatile__("@ atomic64_sub\n" __asm__ __volatile__("@ atomic64_sub\n"
"1: ldrexd %0, %H0, [%3]\n" "1: ldrexd %0, %H0, [%3]\n"
" subs %0, %0, %4\n" " subs %Q0, %Q0, %Q4\n"
" sbc %H0, %H0, %H4\n" " sbc %R0, %R0, %R4\n"
" strexd %1, %0, %H0, [%3]\n" " strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n" " teq %1, #0\n"
" bne 1b" " bne 1b"
...@@ -337,8 +343,8 @@ static inline long long atomic64_sub_return(long long i, atomic64_t *v) ...@@ -337,8 +343,8 @@ static inline long long atomic64_sub_return(long long i, atomic64_t *v)
__asm__ __volatile__("@ atomic64_sub_return\n" __asm__ __volatile__("@ atomic64_sub_return\n"
"1: ldrexd %0, %H0, [%3]\n" "1: ldrexd %0, %H0, [%3]\n"
" subs %0, %0, %4\n" " subs %Q0, %Q0, %Q4\n"
" sbc %H0, %H0, %H4\n" " sbc %R0, %R0, %R4\n"
" strexd %1, %0, %H0, [%3]\n" " strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n" " teq %1, #0\n"
" bne 1b" " bne 1b"
...@@ -406,9 +412,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) ...@@ -406,9 +412,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
__asm__ __volatile__("@ atomic64_dec_if_positive\n" __asm__ __volatile__("@ atomic64_dec_if_positive\n"
"1: ldrexd %0, %H0, [%3]\n" "1: ldrexd %0, %H0, [%3]\n"
" subs %0, %0, #1\n" " subs %Q0, %Q0, #1\n"
" sbc %H0, %H0, #0\n" " sbc %R0, %R0, #0\n"
" teq %H0, #0\n" " teq %R0, #0\n"
" bmi 2f\n" " bmi 2f\n"
" strexd %1, %0, %H0, [%3]\n" " strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n" " teq %1, #0\n"
...@@ -437,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) ...@@ -437,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
" teqeq %H0, %H5\n" " teqeq %H0, %H5\n"
" moveq %1, #0\n" " moveq %1, #0\n"
" beq 2f\n" " beq 2f\n"
" adds %0, %0, %6\n" " adds %Q0, %Q0, %Q6\n"
" adc %H0, %H0, %H6\n" " adc %R0, %R0, %R6\n"
" strexd %2, %0, %H0, [%4]\n" " strexd %2, %0, %H0, [%4]\n"
" teq %2, #0\n" " teq %2, #0\n"
" bne 1b\n" " bne 1b\n"
......
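
The atomic64 hunks above replace %0/%H0 in the 64-bit add/subtract sequences with the %Q0/%R0 operand modifiers: %Q always names the register holding the least-significant word of a 64-bit operand and %R the most-significant word, whereas %H simply names the higher-numbered register of the pair and therefore picks the wrong half on big-endian (BE8) builds. The ldrexd/strexd operands stay as %0, %H0 because those instructions only need the consecutive register pair. The added prefetchw(&v->counter) issues a prefetch-for-write hint before the exclusive load/store loop. A self-contained sketch of the same pattern, assuming an ARMv6K-or-later target (illustrative only, not the kernel's exact routine):

	/* 64-bit atomic add using ldrexd/strexd; %Q/%R select the low/high
	 * words of the 64-bit operands independently of endianness. */
	static inline void atomic64_add_sketch(long long i, volatile long long *counter)
	{
		long long result;
		unsigned long tmp;

		__builtin_prefetch((const void *)counter, 1);	/* prefetch for write */
		__asm__ __volatile__(
		"1:	ldrexd	%0, %H0, [%3]\n"	/* exclusive load of the 64-bit value */
		"	adds	%Q0, %Q0, %Q4\n"	/* add low words, setting carry */
		"	adc	%R0, %R0, %R4\n"	/* add high words with carry */
		"	strexd	%1, %0, %H0, [%3]\n"	/* try to store the result back */
		"	teq	%1, #0\n"
		"	bne	1b"			/* retry if another CPU intervened */
		: "=&r" (result), "=&r" (tmp), "+Qo" (*counter)
		: "r" (counter), "r" (i)
		: "cc");
	}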
/*
* arch/arm/include/asm/bL_switcher.h
*
* Created by: Nicolas Pitre, April 2012
* Copyright: (C) 2012-2013 Linaro Limited
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef ASM_BL_SWITCHER_H
#define ASM_BL_SWITCHER_H
#include <linux/compiler.h>
#include <linux/types.h>
typedef void (*bL_switch_completion_handler)(void *cookie);
int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
bL_switch_completion_handler completer,
void *completer_cookie);
static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
{
return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
}
/*
* Register here to be notified about runtime enabling/disabling of
* the switcher.
*
* The notifier chain is called with the switcher activation lock held:
* the switcher will not be enabled or disabled during callbacks.
* Callbacks must not call bL_switcher_{get,put}_enabled().
*/
#define BL_NOTIFY_PRE_ENABLE 0
#define BL_NOTIFY_POST_ENABLE 1
#define BL_NOTIFY_PRE_DISABLE 2
#define BL_NOTIFY_POST_DISABLE 3
#ifdef CONFIG_BL_SWITCHER
int bL_switcher_register_notifier(struct notifier_block *nb);
int bL_switcher_unregister_notifier(struct notifier_block *nb);
/*
* Use these functions to temporarily prevent enabling/disabling of
* the switcher.
* bL_switcher_get_enabled() returns true if the switcher is currently
* enabled. Each call to bL_switcher_get_enabled() must be followed
* by a call to bL_switcher_put_enabled(). These functions are not
* recursive.
*/
bool bL_switcher_get_enabled(void);
void bL_switcher_put_enabled(void);
int bL_switcher_trace_trigger(void);
int bL_switcher_get_logical_index(u32 mpidr);
#else
static inline int bL_switcher_register_notifier(struct notifier_block *nb)
{
return 0;
}
static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
{
return 0;
}
static inline bool bL_switcher_get_enabled(void) { return false; }
static inline void bL_switcher_put_enabled(void) { }
static inline int bL_switcher_trace_trigger(void) { return 0; }
static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
#endif /* CONFIG_BL_SWITCHER */
#endif
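
For orientation, a hypothetical in-kernel client of the interface declared above could look like the sketch below; the my_* names are invented for illustration and nothing here is part of this commit. It registers for switcher enable/disable notifications and asks for logical CPU 0 to be handed over to cluster 1, with a completion callback:

	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <asm/bL_switcher.h>

	/* Called once the requested switch has actually completed. */
	static void my_switch_done(void *cookie)
	{
		pr_info("bL switch finished for %s\n", (char *)cookie);
	}

	/* Runs with the switcher activation lock held; must not call
	 * bL_switcher_get_enabled()/bL_switcher_put_enabled() from here. */
	static int my_bl_notify(struct notifier_block *nb, unsigned long event, void *p)
	{
		if (event == BL_NOTIFY_PRE_DISABLE)
			pr_info("bL switcher is about to be disabled\n");
		return NOTIFY_OK;
	}

	static struct notifier_block my_bl_nb = {
		.notifier_call	= my_bl_notify,
	};

	static int __init my_bl_client_init(void)	/* wired up as an initcall */
	{
		bL_switcher_register_notifier(&my_bl_nb);
		return bL_switch_request_cb(0, 1, my_switch_done, "cpu0");
	}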
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
#define _ASMARM_BUG_H #define _ASMARM_BUG_H
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/types.h>
#include <asm/opcodes.h>
#ifdef CONFIG_BUG #ifdef CONFIG_BUG
...@@ -12,10 +14,10 @@ ...@@ -12,10 +14,10 @@
*/ */
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
#define BUG_INSTR_VALUE 0xde02 #define BUG_INSTR_VALUE 0xde02
#define BUG_INSTR_TYPE ".hword " #define BUG_INSTR(__value) __inst_thumb16(__value)
#else #else
#define BUG_INSTR_VALUE 0xe7f001f2 #define BUG_INSTR_VALUE 0xe7f001f2
#define BUG_INSTR_TYPE ".word " #define BUG_INSTR(__value) __inst_arm(__value)
#endif #endif
...@@ -33,7 +35,7 @@ ...@@ -33,7 +35,7 @@
#define __BUG(__file, __line, __value) \ #define __BUG(__file, __line, __value) \
do { \ do { \
asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n" \ asm volatile("1:\t" BUG_INSTR(__value) "\n" \
".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \ ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
"2:\t.asciz " #__file "\n" \ "2:\t.asciz " #__file "\n" \
".popsection\n" \ ".popsection\n" \
...@@ -48,7 +50,7 @@ do { \ ...@@ -48,7 +50,7 @@ do { \
#define __BUG(__file, __line, __value) \ #define __BUG(__file, __line, __value) \
do { \ do { \
asm volatile(BUG_INSTR_TYPE #__value); \ asm volatile(BUG_INSTR(__value) "\n"); \
unreachable(); \ unreachable(); \
} while (0) } while (0)
#endif /* CONFIG_DEBUG_BUGVERBOSE */ #endif /* CONFIG_DEBUG_BUGVERBOSE */
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include <linux/threads.h> #include <linux/threads.h>
#include <asm/irq.h> #include <asm/irq.h>
#define NR_IPI 7 #define NR_IPI 8
typedef struct { typedef struct {
unsigned int __softirq_pending; unsigned int __softirq_pending;
......
...@@ -24,8 +24,8 @@ ...@@ -24,8 +24,8 @@
#define TRACER_TIMEOUT 10000 #define TRACER_TIMEOUT 10000
#define etm_writel(t, v, x) \ #define etm_writel(t, v, x) \
(__raw_writel((v), (t)->etm_regs + (x))) (writel_relaxed((v), (t)->etm_regs + (x)))
#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x))) #define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
/* CoreSight Management Registers */ /* CoreSight Management Registers */
#define CSMR_LOCKACCESS 0xfb0 #define CSMR_LOCKACCESS 0xfb0
...@@ -142,8 +142,8 @@ ...@@ -142,8 +142,8 @@
#define ETBFF_TRIGFL BIT(10) #define ETBFF_TRIGFL BIT(10)
#define etb_writel(t, v, x) \ #define etb_writel(t, v, x) \
(__raw_writel((v), (t)->etb_regs + (x))) (writel_relaxed((v), (t)->etb_regs + (x)))
#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x))) #define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
#define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0) #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
#define etm_unlock(t) \ #define etm_unlock(t) \
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#define __ARM_KGDB_H__ #define __ARM_KGDB_H__
#include <linux/ptrace.h> #include <linux/ptrace.h>
#include <asm/opcodes.h>
/* /*
* GDB assumes that we're a user process being debugged, so * GDB assumes that we're a user process being debugged, so
...@@ -41,7 +42,7 @@ ...@@ -41,7 +42,7 @@
static inline void arch_kgdb_breakpoint(void) static inline void arch_kgdb_breakpoint(void)
{ {
asm(".word 0xe7ffdeff"); asm(__inst_arm(0xe7ffdeff));
} }
extern void kgdb_handle_bus_error(void); extern void kgdb_handle_bus_error(void);
......
...@@ -49,6 +49,7 @@ struct machine_desc { ...@@ -49,6 +49,7 @@ struct machine_desc {
bool (*smp_init)(void); bool (*smp_init)(void);
void (*fixup)(struct tag *, char **, void (*fixup)(struct tag *, char **,
struct meminfo *); struct meminfo *);
void (*init_meminfo)(void);
void (*reserve)(void);/* reserve mem blocks */ void (*reserve)(void);/* reserve mem blocks */
void (*map_io)(void);/* IO mapping function */ void (*map_io)(void);/* IO mapping function */
void (*init_early)(void); void (*init_early)(void);
......
...@@ -41,6 +41,14 @@ extern void mcpm_entry_point(void); ...@@ -41,6 +41,14 @@ extern void mcpm_entry_point(void);
*/ */
void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
/*
* This sets an early poke, i.e. a value to be poked into some address
* from very early assembly code before the CPU is ungated. The
* address must be physical, and if 0 then nothing will happen.
*/
void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
unsigned long poke_phys_addr, unsigned long poke_val);
/* /*
* CPU/cluster power operations API for higher subsystems to use. * CPU/cluster power operations API for higher subsystems to use.
*/ */
......
...@@ -172,8 +172,13 @@ ...@@ -172,8 +172,13 @@
* so that all we need to do is modify the 8-bit constant field. * so that all we need to do is modify the 8-bit constant field.
*/ */
#define __PV_BITS_31_24 0x81000000 #define __PV_BITS_31_24 0x81000000
#define __PV_BITS_7_0 0x81
extern u64 __pv_phys_offset;
extern u64 __pv_offset;
extern void fixup_pv_table(const void *, unsigned long);
extern const void *__pv_table_begin, *__pv_table_end;
extern unsigned long __pv_phys_offset;
#define PHYS_OFFSET __pv_phys_offset #define PHYS_OFFSET __pv_phys_offset
#define __pv_stub(from,to,instr,type) \ #define __pv_stub(from,to,instr,type) \
...@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset; ...@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset;
: "=r" (to) \ : "=r" (to) \
: "r" (from), "I" (type)) : "r" (from), "I" (type))
static inline unsigned long __virt_to_phys(unsigned long x) #define __pv_stub_mov_hi(t) \
__asm__ volatile("@ __pv_stub_mov\n" \
"1: mov %R0, %1\n" \
" .pushsection .pv_table,\"a\"\n" \
" .long 1b\n" \
" .popsection\n" \
: "=r" (t) \
: "I" (__PV_BITS_7_0))
#define __pv_add_carry_stub(x, y) \
__asm__ volatile("@ __pv_add_carry_stub\n" \
"1: adds %Q0, %1, %2\n" \
" adc %R0, %R0, #0\n" \
" .pushsection .pv_table,\"a\"\n" \
" .long 1b\n" \
" .popsection\n" \
: "+r" (y) \
: "r" (x), "I" (__PV_BITS_31_24) \
: "cc")
static inline phys_addr_t __virt_to_phys(unsigned long x)
{ {
unsigned long t; phys_addr_t t;
__pv_stub(x, t, "add", __PV_BITS_31_24);
if (sizeof(phys_addr_t) == 4) {
__pv_stub(x, t, "add", __PV_BITS_31_24);
} else {
__pv_stub_mov_hi(t);
__pv_add_carry_stub(x, t);
}
return t; return t;
} }
static inline unsigned long __phys_to_virt(unsigned long x) static inline unsigned long __phys_to_virt(phys_addr_t x)
{ {
unsigned long t; unsigned long t;
__pv_stub(x, t, "sub", __PV_BITS_31_24); __pv_stub(x, t, "sub", __PV_BITS_31_24);
return t; return t;
} }
#else #else
#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET)
#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) static inline phys_addr_t __virt_to_phys(unsigned long x)
{
return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
}
static inline unsigned long __phys_to_virt(phys_addr_t x)
{
return x - PHYS_OFFSET + PAGE_OFFSET;
}
#endif #endif
#endif #endif
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
...@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x) ...@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x)
static inline void *phys_to_virt(phys_addr_t x) static inline void *phys_to_virt(phys_addr_t x)
{ {
return (void *)(__phys_to_virt((unsigned long)(x))); return (void *)__phys_to_virt(x);
} }
/* /*
* Drivers should NOT use these either. * Drivers should NOT use these either.
*/ */
#define __pa(x) __virt_to_phys((unsigned long)(x)) #define __pa(x) __virt_to_phys((unsigned long)(x))
#define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) #define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);
/*
* These are for systems that have a hardware interconnect supported alias of
* physical memory for idmap purposes. Most cases should leave these
* untouched.
*/
static inline phys_addr_t __virt_to_idmap(unsigned long x)
{
if (arch_virt_to_idmap)
return arch_virt_to_idmap(x);
else
return __virt_to_phys(x);
}
#define virt_to_idmap(x) __virt_to_idmap((unsigned long)(x))
/* /*
* Virtual <-> DMA view memory address translations * Virtual <-> DMA view memory address translations
* Again, these are *only* valid on the kernel direct mapped RAM * Again, these are *only* valid on the kernel direct mapped RAM
......
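
The __pv_stub, __pv_stub_mov_hi and __pv_add_carry_stub macros above emit instructions whose immediates are only placeholders (__PV_BITS_31_24, __PV_BITS_7_0) and record each such instruction's location in the .pv_table section, so that early boot code can patch in the real physical-to-virtual offset once it has been discovered; the mov plus adds/adc pair is what lets the offset exceed 32 bits on LPAE. A deliberately simplified sketch of the patching idea, assuming ARM (not Thumb-2) encodings and a 16MiB-aligned 32-bit offset; the kernel's real fixup is written in assembly and also handles the 64-bit case:

	#include <stdint.h>

	/* Rewrite the 8-bit immediate field of every recorded add/sub so that,
	 * with the rotation already encoded for __PV_BITS_31_24, the instruction
	 * now adds/subtracts the top byte of the real offset. */
	static void fixup_pv_table_sketch(const uint32_t *table, const uint32_t *table_end,
					  uint32_t pv_offset)
	{
		uint32_t imm8 = pv_offset >> 24;	/* offset assumed 16MiB-aligned */

		for (; table < table_end; table++) {
			uint32_t *insn = (uint32_t *)(uintptr_t)*table;

			*insn = (*insn & ~0xffu) | imm8;
		}
	}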
...@@ -16,7 +16,7 @@ typedef struct { ...@@ -16,7 +16,7 @@ typedef struct {
#ifdef CONFIG_CPU_HAS_ASID #ifdef CONFIG_CPU_HAS_ASID
#define ASID_BITS 8 #define ASID_BITS 8
#define ASID_MASK ((~0ULL) << ASID_BITS) #define ASID_MASK ((~0ULL) << ASID_BITS)
#define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) #define ASID(mm) ((unsigned int)((mm)->context.id.counter & ~ASID_MASK))
#else #else
#define ASID(mm) (0) #define ASID(mm) (0)
#endif #endif
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <asm/hw_breakpoint.h> #include <asm/hw_breakpoint.h>
#include <asm/ptrace.h> #include <asm/ptrace.h>
#include <asm/types.h> #include <asm/types.h>
#include <asm/unified.h>
#ifdef __KERNEL__ #ifdef __KERNEL__
#define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \ #define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \
...@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p); ...@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p);
#define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc #define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc
#define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp #define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp
#ifdef CONFIG_SMP
#define __ALT_SMP_ASM(smp, up) \
"9998: " smp "\n" \
" .pushsection \".alt.smp.init\", \"a\"\n" \
" .long 9998b\n" \
" " up "\n" \
" .popsection\n"
#else
#define __ALT_SMP_ASM(smp, up) up
#endif
/* /*
* Prefetching support - only ARMv5. * Prefetching support - only ARMv5.
*/ */
...@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr) ...@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr)
{ {
__asm__ __volatile__( __asm__ __volatile__(
"pld\t%a0" "pld\t%a0"
: :: "p" (ptr));
: "p" (ptr)
: "cc");
} }
#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
#define ARCH_HAS_PREFETCHW #define ARCH_HAS_PREFETCHW
#define prefetchw(ptr) prefetch(ptr) static inline void prefetchw(const void *ptr)
{
#define ARCH_HAS_SPINLOCK_PREFETCH __asm__ __volatile__(
#define spin_lock_prefetch(x) do { } while (0) ".arch_extension mp\n"
__ALT_SMP_ASM(
WASM(pldw) "\t%a0",
WASM(pld) "\t%a0"
)
:: "p" (ptr));
}
#endif
#endif #endif
#define HAVE_ARCH_PICK_MMAP_LAYOUT #define HAVE_ARCH_PICK_MMAP_LAYOUT
......
...@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu); ...@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
extern int register_ipi_completion(struct completion *completion, int cpu);
struct smp_operations { struct smp_operations {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
......
...@@ -5,21 +5,13 @@ ...@@ -5,21 +5,13 @@
#error SMP not supported on pre-ARMv6 CPUs #error SMP not supported on pre-ARMv6 CPUs
#endif #endif
#include <asm/processor.h> #include <linux/prefetch.h>
/* /*
* sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K
* extensions, so when running on UP, we have to patch these instructions away. * extensions, so when running on UP, we have to patch these instructions away.
*/ */
#define ALT_SMP(smp, up) \
"9998: " smp "\n" \
" .pushsection \".alt.smp.init\", \"a\"\n" \
" .long 9998b\n" \
" " up "\n" \
" .popsection\n"
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
#define SEV ALT_SMP("sev.w", "nop.w")
/* /*
* For Thumb-2, special care is needed to ensure that the conditional WFE * For Thumb-2, special care is needed to ensure that the conditional WFE
* instruction really does assemble to exactly 4 bytes (as required by * instruction really does assemble to exactly 4 bytes (as required by
...@@ -31,17 +23,18 @@ ...@@ -31,17 +23,18 @@
* the assembler won't change IT instructions which are explicitly present * the assembler won't change IT instructions which are explicitly present
* in the input. * in the input.
*/ */
#define WFE(cond) ALT_SMP( \ #define WFE(cond) __ALT_SMP_ASM( \
"it " cond "\n\t" \ "it " cond "\n\t" \
"wfe" cond ".n", \ "wfe" cond ".n", \
\ \
"nop.w" \ "nop.w" \
) )
#else #else
#define SEV ALT_SMP("sev", "nop") #define WFE(cond) __ALT_SMP_ASM("wfe" cond, "nop")
#define WFE(cond) ALT_SMP("wfe" cond, "nop")
#endif #endif
#define SEV __ALT_SMP_ASM(WASM(sev), WASM(nop))
static inline void dsb_sev(void) static inline void dsb_sev(void)
{ {
#if __LINUX_ARM_ARCH__ >= 7 #if __LINUX_ARM_ARCH__ >= 7
...@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) ...@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
u32 newval; u32 newval;
arch_spinlock_t lockval; arch_spinlock_t lockval;
prefetchw(&lock->slock);
__asm__ __volatile__( __asm__ __volatile__(
"1: ldrex %0, [%3]\n" "1: ldrex %0, [%3]\n"
" add %1, %0, %4\n" " add %1, %0, %4\n"
...@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) ...@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
unsigned long contended, res; unsigned long contended, res;
u32 slock; u32 slock;
prefetchw(&lock->slock);
do { do {
__asm__ __volatile__( __asm__ __volatile__(
" ldrex %0, [%3]\n" " ldrex %0, [%3]\n"
...@@ -156,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw) ...@@ -156,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
{ {
unsigned long tmp; unsigned long tmp;
prefetchw(&rw->lock);
__asm__ __volatile__( __asm__ __volatile__(
"1: ldrex %0, [%1]\n" "1: ldrex %0, [%1]\n"
" teq %0, #0\n" " teq %0, #0\n"
...@@ -174,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) ...@@ -174,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
{ {
unsigned long contended, res; unsigned long contended, res;
prefetchw(&rw->lock);
do { do {
__asm__ __volatile__( __asm__ __volatile__(
" ldrex %0, [%2]\n" " ldrex %0, [%2]\n"
...@@ -207,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) ...@@ -207,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
} }
/* write_can_lock - would write_trylock() succeed? */ /* write_can_lock - would write_trylock() succeed? */
#define arch_write_can_lock(x) ((x)->lock == 0) #define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0)
/* /*
* Read locks are a bit more hairy: * Read locks are a bit more hairy:
...@@ -225,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw) ...@@ -225,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
{ {
unsigned long tmp, tmp2; unsigned long tmp, tmp2;
prefetchw(&rw->lock);
__asm__ __volatile__( __asm__ __volatile__(
"1: ldrex %0, [%2]\n" "1: ldrex %0, [%2]\n"
" adds %0, %0, #1\n" " adds %0, %0, #1\n"
...@@ -245,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) ...@@ -245,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
smp_mb(); smp_mb();
prefetchw(&rw->lock);
__asm__ __volatile__( __asm__ __volatile__(
"1: ldrex %0, [%2]\n" "1: ldrex %0, [%2]\n"
" sub %0, %0, #1\n" " sub %0, %0, #1\n"
...@@ -263,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) ...@@ -263,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
{ {
unsigned long contended, res; unsigned long contended, res;
prefetchw(&rw->lock);
do { do {
__asm__ __volatile__( __asm__ __volatile__(
" ldrex %0, [%2]\n" " ldrex %0, [%2]\n"
...@@ -284,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) ...@@ -284,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
} }
/* read_can_lock - would read_trylock() succeed? */ /* read_can_lock - would read_trylock() succeed? */
#define arch_read_can_lock(x) ((x)->lock < 0x80000000) #define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000)
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
......
...@@ -25,7 +25,7 @@ typedef struct { ...@@ -25,7 +25,7 @@ typedef struct {
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
typedef struct { typedef struct {
volatile unsigned int lock; u32 lock;
} arch_rwlock_t; } arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED { 0 } #define __ARCH_RW_LOCK_UNLOCKED { 0 }
......
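These two hunks go together: arch_rwlock_t drops the volatile qualifier from its lock word, and the *_can_lock() helpers compensate by forcing a fresh load with ACCESS_ONCE(). For reference, ACCESS_ONCE() here is essentially the generic

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

so the compiler must re-read the lock word on every evaluation of the macro, while the ldrex/strex loops elsewhere in the file are free of the pessimisation a volatile struct member would impose.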
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
#ifdef __ASSEMBLY__ #ifdef __ASSEMBLY__
#define W(instr) instr.w #define W(instr) instr.w
#define BSYM(sym) sym + 1 #define BSYM(sym) sym + 1
#else
#define WASM(instr) #instr ".w"
#endif #endif
#else /* !CONFIG_THUMB2_KERNEL */ #else /* !CONFIG_THUMB2_KERNEL */
...@@ -50,6 +52,8 @@ ...@@ -50,6 +52,8 @@
#ifdef __ASSEMBLY__ #ifdef __ASSEMBLY__
#define W(instr) instr #define W(instr) instr
#define BSYM(sym) sym #define BSYM(sym) sym
#else
#define WASM(instr) #instr
#endif #endif
#endif /* CONFIG_THUMB2_KERNEL */ #endif /* CONFIG_THUMB2_KERNEL */
......
...@@ -25,12 +25,14 @@ ...@@ -25,12 +25,14 @@
.macro waituart,rd,rx .macro waituart,rd,rx
1001: ldr \rd, [\rx, #UART01x_FR] 1001: ldr \rd, [\rx, #UART01x_FR]
ARM_BE8( rev \rd, \rd )
tst \rd, #UART01x_FR_TXFF tst \rd, #UART01x_FR_TXFF
bne 1001b bne 1001b
.endm .endm
.macro busyuart,rd,rx .macro busyuart,rd,rx
1001: ldr \rd, [\rx, #UART01x_FR] 1001: ldr \rd, [\rx, #UART01x_FR]
ARM_BE8( rev \rd, \rd )
tst \rd, #UART01x_FR_BUSY tst \rd, #UART01x_FR_BUSY
bne 1001b bne 1001b
.endm .endm
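ARM_BE8(), used here and throughout the rest of this merge, comes from <asm/assembler.h>; it simply discards its argument on little-endian builds. Paraphrased (not part of this hunk):

	#ifdef CONFIG_CPU_ENDIAN_BE8
	#define ARM_BE8(code...)	code
	#else
	#define ARM_BE8(code...)
	#endif

The extra rev therefore only appears on BE8 kernels, where a data load from the little-endian PL01x flag register arrives byte-swapped.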
...@@ -7,6 +7,7 @@ header-y += hwcap.h ...@@ -7,6 +7,7 @@ header-y += hwcap.h
header-y += ioctls.h header-y += ioctls.h
header-y += kvm_para.h header-y += kvm_para.h
header-y += mman.h header-y += mman.h
header-y += perf_regs.h
header-y += posix_types.h header-y += posix_types.h
header-y += ptrace.h header-y += ptrace.h
header-y += setup.h header-y += setup.h
......
#ifndef _ASM_ARM_PERF_REGS_H
#define _ASM_ARM_PERF_REGS_H
enum perf_event_arm_regs {
PERF_REG_ARM_R0,
PERF_REG_ARM_R1,
PERF_REG_ARM_R2,
PERF_REG_ARM_R3,
PERF_REG_ARM_R4,
PERF_REG_ARM_R5,
PERF_REG_ARM_R6,
PERF_REG_ARM_R7,
PERF_REG_ARM_R8,
PERF_REG_ARM_R9,
PERF_REG_ARM_R10,
PERF_REG_ARM_FP,
PERF_REG_ARM_IP,
PERF_REG_ARM_SP,
PERF_REG_ARM_LR,
PERF_REG_ARM_PC,
PERF_REG_ARM_MAX,
};
#endif /* _ASM_ARM_PERF_REGS_H */
...@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg ...@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg
obj-y := elf.o entry-common.o irq.o opcodes.o \ obj-y := elf.o entry-common.o irq.o opcodes.o \
process.o ptrace.o return_address.o \ process.o ptrace.o return_address.o \
setup.o signal.o stacktrace.o sys_arm.o time.o traps.o setup.o signal.o sigreturn_codes.o \
stacktrace.o sys_arm.o time.o traps.o
obj-$(CONFIG_ATAGS) += atags_parse.o obj-$(CONFIG_ATAGS) += atags_parse.o
obj-$(CONFIG_ATAGS_PROC) += atags_proc.o obj-$(CONFIG_ATAGS_PROC) += atags_proc.o
...@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o ...@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o
obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o
obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o
obj-$(CONFIG_IWMMXT) += iwmmxt.o obj-$(CONFIG_IWMMXT) += iwmmxt.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o
AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt
obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o
......
...@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc); ...@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
EXPORT_SYMBOL(__pv_phys_offset); EXPORT_SYMBOL(__pv_phys_offset);
EXPORT_SYMBOL(__pv_offset);
#endif #endif
...@@ -416,9 +416,8 @@ __und_usr: ...@@ -416,9 +416,8 @@ __und_usr:
bne __und_usr_thumb bne __und_usr_thumb
sub r4, r2, #4 @ ARM instr at LR - 4 sub r4, r2, #4 @ ARM instr at LR - 4
1: ldrt r0, [r4] 1: ldrt r0, [r4]
#ifdef CONFIG_CPU_ENDIAN_BE8 ARM_BE8(rev r0, r0) @ little endian instruction
rev r0, r0 @ little endian instruction
#endif
@ r0 = 32-bit ARM instruction which caused the exception @ r0 = 32-bit ARM instruction which caused the exception
@ r2 = PC value for the following instruction (:= regs->ARM_pc) @ r2 = PC value for the following instruction (:= regs->ARM_pc)
@ r4 = PC value for the faulting instruction @ r4 = PC value for the faulting instruction
......
...@@ -393,9 +393,7 @@ ENTRY(vector_swi) ...@@ -393,9 +393,7 @@ ENTRY(vector_swi)
#else #else
USER( ldr r10, [lr, #-4] ) @ get SWI instruction USER( ldr r10, [lr, #-4] ) @ get SWI instruction
#endif #endif
#ifdef CONFIG_CPU_ENDIAN_BE8 ARM_BE8(rev r10, r10) @ little endian instruction
rev r10, r10 @ little endian instruction
#endif
#elif defined(CONFIG_AEABI) #elif defined(CONFIG_AEABI)
......
...@@ -77,6 +77,7 @@ ...@@ -77,6 +77,7 @@
__HEAD __HEAD
ENTRY(stext) ENTRY(stext)
ARM_BE8(setend be ) @ ensure we are in BE8 mode
THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM. THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM.
THUMB( bx r9 ) @ If this is a Thumb-2 kernel, THUMB( bx r9 ) @ If this is a Thumb-2 kernel,
...@@ -352,6 +353,9 @@ ENTRY(secondary_startup) ...@@ -352,6 +353,9 @@ ENTRY(secondary_startup)
* the processor type - there is no need to check the machine type * the processor type - there is no need to check the machine type
* as it has already been validated by the primary processor. * as it has already been validated by the primary processor.
*/ */
ARM_BE8(setend be) @ ensure we are in BE8 mode
#ifdef CONFIG_ARM_VIRT_EXT #ifdef CONFIG_ARM_VIRT_EXT
bl __hyp_stub_install_secondary bl __hyp_stub_install_secondary
#endif #endif
...@@ -555,6 +559,14 @@ ENTRY(fixup_smp) ...@@ -555,6 +559,14 @@ ENTRY(fixup_smp)
ldmfd sp!, {r4 - r6, pc} ldmfd sp!, {r4 - r6, pc}
ENDPROC(fixup_smp) ENDPROC(fixup_smp)
#ifdef __ARMEB__
#define LOW_OFFSET 0x4
#define HIGH_OFFSET 0x0
#else
#define LOW_OFFSET 0x0
#define HIGH_OFFSET 0x4
#endif
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
/* __fixup_pv_table - patch the stub instructions with the delta between /* __fixup_pv_table - patch the stub instructions with the delta between
...@@ -565,17 +577,20 @@ ENDPROC(fixup_smp) ...@@ -565,17 +577,20 @@ ENDPROC(fixup_smp)
__HEAD __HEAD
__fixup_pv_table: __fixup_pv_table:
adr r0, 1f adr r0, 1f
ldmia r0, {r3-r5, r7} ldmia r0, {r3-r7}
sub r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET mvn ip, #0
subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET
add r4, r4, r3 @ adjust table start address add r4, r4, r3 @ adjust table start address
add r5, r5, r3 @ adjust table end address add r5, r5, r3 @ adjust table end address
add r7, r7, r3 @ adjust __pv_phys_offset address add r6, r6, r3 @ adjust __pv_phys_offset address
str r8, [r7] @ save computed PHYS_OFFSET to __pv_phys_offset add r7, r7, r3 @ adjust __pv_offset address
str r8, [r6, #LOW_OFFSET] @ save computed PHYS_OFFSET to __pv_phys_offset
strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits
mov r6, r3, lsr #24 @ constant for add/sub instructions mov r6, r3, lsr #24 @ constant for add/sub instructions
teq r3, r6, lsl #24 @ must be 16MiB aligned teq r3, r6, lsl #24 @ must be 16MiB aligned
THUMB( it ne @ cross section branch ) THUMB( it ne @ cross section branch )
bne __error bne __error
str r6, [r7, #4] @ save to __pv_offset str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits
b __fixup_a_pv_table b __fixup_a_pv_table
ENDPROC(__fixup_pv_table) ENDPROC(__fixup_pv_table)
...@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table) ...@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table)
.long __pv_table_begin .long __pv_table_begin
.long __pv_table_end .long __pv_table_end
2: .long __pv_phys_offset 2: .long __pv_phys_offset
.long __pv_offset
.text .text
__fixup_a_pv_table: __fixup_a_pv_table:
adr r0, 3f
ldr r6, [r0]
add r6, r6, r3
ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word
ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word
mov r6, r6, lsr #24
cmn r0, #1
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
moveq r0, #0x200000 @ set bit 21, mov to mvn instruction
lsls r6, #24 lsls r6, #24
beq 2f beq 2f
clz r7, r6 clz r7, r6
...@@ -601,18 +625,42 @@ __fixup_a_pv_table: ...@@ -601,18 +625,42 @@ __fixup_a_pv_table:
b 2f b 2f
1: add r7, r3 1: add r7, r3
ldrh ip, [r7, #2] ldrh ip, [r7, #2]
and ip, 0x8f00 ARM_BE8(rev16 ip, ip)
orr ip, r6 @ mask in offset bits 31-24 tst ip, #0x4000
and ip, #0x8f00
orrne ip, r6 @ mask in offset bits 31-24
orreq ip, r0 @ mask in offset bits 7-0
ARM_BE8(rev16 ip, ip)
strh ip, [r7, #2] strh ip, [r7, #2]
bne 2f
ldrh ip, [r7]
ARM_BE8(rev16 ip, ip)
bic ip, #0x20
orr ip, ip, r0, lsr #16
ARM_BE8(rev16 ip, ip)
strh ip, [r7]
2: cmp r4, r5 2: cmp r4, r5
ldrcc r7, [r4], #4 @ use branch for delay slot ldrcc r7, [r4], #4 @ use branch for delay slot
bcc 1b bcc 1b
bx lr bx lr
#else #else
moveq r0, #0x400000 @ set bit 22, mov to mvn instruction
b 2f b 2f
1: ldr ip, [r7, r3] 1: ldr ip, [r7, r3]
#ifdef CONFIG_CPU_ENDIAN_BE8
@ in BE8, we load data in BE, but instructions are still in LE
bic ip, ip, #0xff000000
tst ip, #0x000f0000 @ check the rotation field
orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24
biceq ip, ip, #0x00004000 @ clear bit 22
orreq ip, ip, r0, lsl #24 @ mask in offset bits 7-0
#else
bic ip, ip, #0x000000ff bic ip, ip, #0x000000ff
orr ip, ip, r6 @ mask in offset bits 31-24 tst ip, #0xf00 @ check the rotation field
orrne ip, ip, r6 @ mask in offset bits 31-24
biceq ip, ip, #0x400000 @ clear bit 22
orreq ip, ip, r0 @ mask in offset bits 7-0
#endif
str ip, [r7, r3] str ip, [r7, r3]
2: cmp r4, r5 2: cmp r4, r5
ldrcc r7, [r4], #4 @ use branch for delay slot ldrcc r7, [r4], #4 @ use branch for delay slot
...@@ -621,28 +669,30 @@ __fixup_a_pv_table: ...@@ -621,28 +669,30 @@ __fixup_a_pv_table:
#endif #endif
ENDPROC(__fixup_a_pv_table) ENDPROC(__fixup_a_pv_table)
.align
3: .long __pv_offset
ENTRY(fixup_pv_table) ENTRY(fixup_pv_table)
stmfd sp!, {r4 - r7, lr} stmfd sp!, {r4 - r7, lr}
ldr r2, 2f @ get address of __pv_phys_offset
mov r3, #0 @ no offset mov r3, #0 @ no offset
mov r4, r0 @ r0 = table start mov r4, r0 @ r0 = table start
add r5, r0, r1 @ r1 = table size add r5, r0, r1 @ r1 = table size
ldr r6, [r2, #4] @ get __pv_offset
bl __fixup_a_pv_table bl __fixup_a_pv_table
ldmfd sp!, {r4 - r7, pc} ldmfd sp!, {r4 - r7, pc}
ENDPROC(fixup_pv_table) ENDPROC(fixup_pv_table)
.align
2: .long __pv_phys_offset
.data .data
.globl __pv_phys_offset .globl __pv_phys_offset
.type __pv_phys_offset, %object .type __pv_phys_offset, %object
__pv_phys_offset: __pv_phys_offset:
.long 0 .quad 0
.size __pv_phys_offset, . - __pv_phys_offset .size __pv_phys_offset, . -__pv_phys_offset
.globl __pv_offset
.type __pv_offset, %object
__pv_offset: __pv_offset:
.long 0 .quad 0
.size __pv_offset, . -__pv_offset
#endif #endif
#include "head-common.S" #include "head-common.S"
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <asm/sections.h> #include <asm/sections.h>
#include <asm/smp_plat.h> #include <asm/smp_plat.h>
#include <asm/unwind.h> #include <asm/unwind.h>
#include <asm/opcodes.h>
#ifdef CONFIG_XIP_KERNEL #ifdef CONFIG_XIP_KERNEL
/* /*
...@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
Elf32_Sym *sym; Elf32_Sym *sym;
const char *symname; const char *symname;
s32 offset; s32 offset;
u32 tmp;
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
u32 upper, lower, sign, j1, j2; u32 upper, lower, sign, j1, j2;
#endif #endif
...@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
case R_ARM_PC24: case R_ARM_PC24:
case R_ARM_CALL: case R_ARM_CALL:
case R_ARM_JUMP24: case R_ARM_JUMP24:
offset = (*(u32 *)loc & 0x00ffffff) << 2; offset = __mem_to_opcode_arm(*(u32 *)loc);
offset = (offset & 0x00ffffff) << 2;
if (offset & 0x02000000) if (offset & 0x02000000)
offset -= 0x04000000; offset -= 0x04000000;
...@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
} }
offset >>= 2; offset >>= 2;
offset &= 0x00ffffff;
*(u32 *)loc &= 0xff000000; *(u32 *)loc &= __opcode_to_mem_arm(0xff000000);
*(u32 *)loc |= offset & 0x00ffffff; *(u32 *)loc |= __opcode_to_mem_arm(offset);
break; break;
case R_ARM_V4BX: case R_ARM_V4BX:
...@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
* other bits to re-code instruction as * other bits to re-code instruction as
* MOV PC,Rm. * MOV PC,Rm.
*/ */
*(u32 *)loc &= 0xf000000f; *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f);
*(u32 *)loc |= 0x01a0f000; *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000);
break; break;
case R_ARM_PREL31: case R_ARM_PREL31:
...@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
case R_ARM_MOVW_ABS_NC: case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_ABS: case R_ARM_MOVT_ABS:
offset = *(u32 *)loc; offset = tmp = __mem_to_opcode_arm(*(u32 *)loc);
offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff);
offset = (offset ^ 0x8000) - 0x8000; offset = (offset ^ 0x8000) - 0x8000;
...@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS)
offset >>= 16; offset >>= 16;
*(u32 *)loc &= 0xfff0f000; tmp &= 0xfff0f000;
*(u32 *)loc |= ((offset & 0xf000) << 4) | tmp |= ((offset & 0xf000) << 4) |
(offset & 0x0fff); (offset & 0x0fff);
*(u32 *)loc = __opcode_to_mem_arm(tmp);
break; break;
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
case R_ARM_THM_CALL: case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24: case R_ARM_THM_JUMP24:
upper = *(u16 *)loc; upper = __mem_to_opcode_thumb16(*(u16 *)loc);
lower = *(u16 *)(loc + 2); lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
/* /*
* 25 bit signed address range (Thumb-2 BL and B.W * 25 bit signed address range (Thumb-2 BL and B.W
...@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
sign = (offset >> 24) & 1; sign = (offset >> 24) & 1;
j1 = sign ^ (~(offset >> 23) & 1); j1 = sign ^ (~(offset >> 23) & 1);
j2 = sign ^ (~(offset >> 22) & 1); j2 = sign ^ (~(offset >> 22) & 1);
*(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) | upper = (u16)((upper & 0xf800) | (sign << 10) |
((offset >> 12) & 0x03ff)); ((offset >> 12) & 0x03ff));
*(u16 *)(loc + 2) = (u16)((lower & 0xd000) | lower = (u16)((lower & 0xd000) |
(j1 << 13) | (j2 << 11) | (j1 << 13) | (j2 << 11) |
((offset >> 1) & 0x07ff)); ((offset >> 1) & 0x07ff));
*(u16 *)loc = __opcode_to_mem_thumb16(upper);
*(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
break; break;
case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_ABS: case R_ARM_THM_MOVT_ABS:
upper = *(u16 *)loc; upper = __mem_to_opcode_thumb16(*(u16 *)loc);
lower = *(u16 *)(loc + 2); lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
/* /*
* MOVT/MOVW instructions encoding in Thumb-2: * MOVT/MOVW instructions encoding in Thumb-2:
...@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, ...@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS)
offset >>= 16; offset >>= 16;
*(u16 *)loc = (u16)((upper & 0xfbf0) | upper = (u16)((upper & 0xfbf0) |
((offset & 0xf000) >> 12) | ((offset & 0xf000) >> 12) |
((offset & 0x0800) >> 1)); ((offset & 0x0800) >> 1));
*(u16 *)(loc + 2) = (u16)((lower & 0x8f00) | lower = (u16)((lower & 0x8f00) |
((offset & 0x0700) << 4) | ((offset & 0x0700) << 4) |
(offset & 0x00ff)); (offset & 0x00ff));
*(u16 *)loc = __opcode_to_mem_thumb16(upper);
*(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
break; break;
#endif #endif
......
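The __mem_to_opcode_*/__opcode_to_mem_* conversions above exist because, in BE8, data accesses are big-endian while instructions remain little-endian in memory, so an opcode read or written through a data pointer must be byte-swapped. A simplified sketch of what the <asm/opcodes.h> helpers reduce to (paraphrased; the real header covers more cases):

	#ifdef CONFIG_CPU_ENDIAN_BE8
	#define __opcode_to_mem_arm(x)		swab32(x)
	#define __mem_to_opcode_arm(x)		swab32(x)
	#define __opcode_to_mem_thumb16(x)	swab16(x)
	#define __mem_to_opcode_thumb16(x)	swab16(x)
	#else
	#define __opcode_to_mem_arm(x)		(x)
	#define __mem_to_opcode_arm(x)		(x)
	#define __opcode_to_mem_thumb16(x)	(x)
	#define __mem_to_opcode_thumb16(x)	(x)
	#endif

__opcode_thumb32_compose(first, second) just joins two 16-bit halfwords into the canonical 32-bit Thumb-2 encoding, (first << 16) | second.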
...@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events, ...@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events,
struct perf_event *event) struct perf_event *event)
{ {
struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
struct pmu *leader_pmu = event->group_leader->pmu;
if (is_software_event(event)) if (is_software_event(event))
return 1; return 1;
if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) if (event->state < PERF_EVENT_STATE_OFF)
return 1; return 1;
if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
......
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/perf_event.h>
#include <linux/bug.h>
#include <asm/perf_regs.h>
#include <asm/ptrace.h>
u64 perf_reg_value(struct pt_regs *regs, int idx)
{
if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
return 0;
return regs->uregs[idx];
}
#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
int perf_reg_validate(u64 mask)
{
if (!mask || mask & REG_RESERVED)
return -EINVAL;
return 0;
}
u64 perf_reg_abi(struct task_struct *task)
{
return PERF_SAMPLE_REGS_ABI_32;
}
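To make the REG_RESERVED check concrete (a worked example, not extra kernel code): with PERF_REG_ARM_MAX == 16, the valid sample mask is the low 16 bits.

	/* (1ULL << 16) - 1 = 0x000000000000ffff   valid register bits
	 * REG_RESERVED     = 0xffffffffffff0000   everything else
	 * mask 0x0000800f (PC plus r0-r3): mask & REG_RESERVED == 0, accepted
	 * mask with bit 32 set:            overlaps REG_RESERVED, -EINVAL
	 */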
...@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup); ...@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup);
#endif #endif
extern void paging_init(const struct machine_desc *desc); extern void paging_init(const struct machine_desc *desc);
extern void early_paging_init(const struct machine_desc *,
struct proc_info_list *);
extern void sanity_check_meminfo(void); extern void sanity_check_meminfo(void);
extern enum reboot_mode reboot_mode; extern enum reboot_mode reboot_mode;
extern void setup_dma_zone(const struct machine_desc *desc); extern void setup_dma_zone(const struct machine_desc *desc);
...@@ -888,6 +890,8 @@ void __init setup_arch(char **cmdline_p) ...@@ -888,6 +890,8 @@ void __init setup_arch(char **cmdline_p)
parse_early_param(); parse_early_param();
sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
sanity_check_meminfo(); sanity_check_meminfo();
arm_memblock_init(&meminfo, mdesc); arm_memblock_init(&meminfo, mdesc);
......
...@@ -21,29 +21,7 @@ ...@@ -21,29 +21,7 @@
#include <asm/unistd.h> #include <asm/unistd.h>
#include <asm/vfp.h> #include <asm/vfp.h>
/* extern const unsigned long sigreturn_codes[7];
* For ARM syscalls, we encode the syscall number into the instruction.
*/
#define SWI_SYS_SIGRETURN (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
#define SWI_SYS_RT_SIGRETURN (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
/*
* With EABI, the syscall number has to be loaded into r7.
*/
#define MOV_R7_NR_SIGRETURN (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE))
#define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
/*
* For Thumb syscalls, we pass the syscall number via r7. We therefore
* need two 16-bit instructions.
*/
#define SWI_THUMB_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE))
#define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
static const unsigned long sigreturn_codes[7] = {
MOV_R7_NR_SIGRETURN, SWI_SYS_SIGRETURN, SWI_THUMB_SIGRETURN,
MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN,
};
static unsigned long signal_return_offset; static unsigned long signal_return_offset;
......
/*
* sigreturn_codes.S - code snippets for sigreturn syscalls
*
* Created by: Victor Kamensky, 2013-08-13
* Copyright: (C) 2013 Linaro Limited
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <asm/unistd.h>
/*
* For ARM syscalls, we encode the syscall number into the instruction.
* With EABI, the syscall number has to be loaded into r7. As a result,
* the ARM syscall sequence snippet consists of a mov and an svc in
* .arm encoding.
*
* For Thumb syscalls, we pass the syscall number via r7, so we need
* two 16-bit instructions in .thumb encoding.
*
* Please note that the sigreturn_codes snippets are not executed in
* place; the kernel copies them into the appropriate places. The code
* in arch/arm/kernel/signal.c is very sensitive to the layout of these
* snippets.
*/
#if __LINUX_ARM_ARCH__ <= 4
/*
* Note that we manually select the minimal architecture that supports
* the required Thumb opcodes when building for early architecture
* versions. It is OK for this file to be used in combination with
* lower arch variants, since these code snippets are only used as
* input data.
*/
.arch armv4t
#endif
.section .rodata
.global sigreturn_codes
.type sigreturn_codes, #object
.arm
sigreturn_codes:
/* ARM sigreturn syscall code snippet */
mov r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
swi #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)
/* Thumb sigreturn syscall code snippet */
.thumb
movs r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
swi #0
/* ARM sigreturn_rt syscall code snippet */
.arm
mov r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
swi #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)
/* Thumb sigreturn_rt syscall code snippet */
.thumb
movs r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
swi #0
/*
* Note on additional space: the setup_return() code in signal.c
* always copies two words, regardless of whether it is the Thumb
* case or not, so we need an additional word after the real last
* entry.
*/
.arm
.space 4
.size sigreturn_codes, . - sigreturn_codes
...@@ -55,6 +55,7 @@ ...@@ -55,6 +55,7 @@
* specific registers and some other data for resume. * specific registers and some other data for resume.
* r0 = suspend function arg0 * r0 = suspend function arg0
* r1 = suspend function * r1 = suspend function
* r2 = MPIDR value the resuming CPU will use
*/ */
ENTRY(__cpu_suspend) ENTRY(__cpu_suspend)
stmfd sp!, {r4 - r11, lr} stmfd sp!, {r4 - r11, lr}
...@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend) ...@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend)
mov r5, sp @ current virtual SP mov r5, sp @ current virtual SP
add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn
sub sp, sp, r4 @ allocate CPU state on stack sub sp, sp, r4 @ allocate CPU state on stack
stmfd sp!, {r0, r1} @ save suspend func arg and pointer
add r0, sp, #8 @ save pointer to save block
mov r1, r4 @ size of save block
mov r2, r5 @ virtual SP
ldr r3, =sleep_save_sp ldr r3, =sleep_save_sp
stmfd sp!, {r0, r1} @ save suspend func arg and pointer
ldr r3, [r3, #SLEEP_SAVE_SP_VIRT] ldr r3, [r3, #SLEEP_SAVE_SP_VIRT]
ALT_SMP(mrc p15, 0, r9, c0, c0, 5) ALT_SMP(ldr r0, =mpidr_hash)
ALT_UP_B(1f) ALT_UP_B(1f)
ldr r8, =mpidr_hash /* This ldmia relies on the memory layout of the mpidr_hash struct */
/* ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
* This ldmia relies on the memory layout of the mpidr_hash compute_mpidr_hash r0, r6, r7, r8, r2, r1
* struct mpidr_hash. add r3, r3, r0, lsl #2
*/ 1: mov r2, r5 @ virtual SP
ldmia r8, {r4-r7} @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts mov r1, r4 @ size of save block
compute_mpidr_hash lr, r5, r6, r7, r9, r4 add r0, sp, #8 @ pointer to save block
add r3, r3, lr, lsl #2
1:
bl __cpu_suspend_save bl __cpu_suspend_save
adr lr, BSYM(cpu_suspend_abort) adr lr, BSYM(cpu_suspend_abort)
ldmfd sp!, {r0, pc} @ call suspend fn ldmfd sp!, {r0, pc} @ call suspend fn
...@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu) ...@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu)
.data .data
.align .align
ENTRY(cpu_resume) ENTRY(cpu_resume)
ARM_BE8(setend be) @ ensure we are in BE mode
mov r1, #0 mov r1, #0
ALT_SMP(mrc p15, 0, r0, c0, c0, 5) ALT_SMP(mrc p15, 0, r0, c0, c0, 5)
ALT_UP_B(1f) ALT_UP_B(1f)
......
...@@ -68,6 +68,7 @@ enum ipi_msg_type { ...@@ -68,6 +68,7 @@ enum ipi_msg_type {
IPI_CALL_FUNC_SINGLE, IPI_CALL_FUNC_SINGLE,
IPI_CPU_STOP, IPI_CPU_STOP,
IPI_IRQ_WORK, IPI_IRQ_WORK,
IPI_COMPLETION,
}; };
static DECLARE_COMPLETION(cpu_running); static DECLARE_COMPLETION(cpu_running);
...@@ -82,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops) ...@@ -82,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops)
static unsigned long get_arch_pgd(pgd_t *pgd) static unsigned long get_arch_pgd(pgd_t *pgd)
{ {
phys_addr_t pgdir = virt_to_phys(pgd); phys_addr_t pgdir = virt_to_idmap(pgd);
BUG_ON(pgdir & ARCH_PGD_MASK); BUG_ON(pgdir & ARCH_PGD_MASK);
return pgdir >> ARCH_PGD_SHIFT; return pgdir >> ARCH_PGD_SHIFT;
} }
...@@ -467,6 +468,7 @@ static const char *ipi_types[NR_IPI] = { ...@@ -467,6 +468,7 @@ static const char *ipi_types[NR_IPI] = {
S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
S(IPI_CPU_STOP, "CPU stop interrupts"), S(IPI_CPU_STOP, "CPU stop interrupts"),
S(IPI_IRQ_WORK, "IRQ work interrupts"), S(IPI_IRQ_WORK, "IRQ work interrupts"),
S(IPI_COMPLETION, "completion interrupts"),
}; };
void show_ipi_list(struct seq_file *p, int prec) void show_ipi_list(struct seq_file *p, int prec)
...@@ -526,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu) ...@@ -526,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu)
cpu_relax(); cpu_relax();
} }
static DEFINE_PER_CPU(struct completion *, cpu_completion);
int register_ipi_completion(struct completion *completion, int cpu)
{
per_cpu(cpu_completion, cpu) = completion;
return IPI_COMPLETION;
}
static void ipi_complete(unsigned int cpu)
{
complete(per_cpu(cpu_completion, cpu));
}
/* /*
* Main handler for inter-processor interrupts * Main handler for inter-processor interrupts
*/ */
...@@ -584,6 +599,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs) ...@@ -584,6 +599,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
break; break;
#endif #endif
case IPI_COMPLETION:
irq_enter();
ipi_complete(cpu);
irq_exit();
break;
default: default:
printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
cpu, ipinr); cpu, ipinr);
......
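register_ipi_completion() pairs a caller-supplied completion with the new IPI_COMPLETION message; the intended consumer is the big.LITTLE switcher. A sketch of the calling pattern, with invented names around the real API:

	static void wait_for_peer_cpu(void)
	{
		int cpu = smp_processor_id();
		DECLARE_COMPLETION_ONSTACK(done);
		int ipi = register_ipi_completion(&done, cpu);

		/* hand (cpu, ipi) to the peer CPU, which eventually does
		 * something like gic_raise_softirq(cpumask_of(cpu), ipi) */

		wait_for_completion(&done);	/* finished by ipi_complete() */
	}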
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
*/ */
unsigned int __init scu_get_core_count(void __iomem *scu_base) unsigned int __init scu_get_core_count(void __iomem *scu_base)
{ {
unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG); unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG);
return (ncores & 0x03) + 1; return (ncores & 0x03) + 1;
} }
...@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base) ...@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base)
#ifdef CONFIG_ARM_ERRATA_764369 #ifdef CONFIG_ARM_ERRATA_764369
/* Cortex-A9 only */ /* Cortex-A9 only */
if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) { if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) {
scu_ctrl = __raw_readl(scu_base + 0x30); scu_ctrl = readl_relaxed(scu_base + 0x30);
if (!(scu_ctrl & 1)) if (!(scu_ctrl & 1))
__raw_writel(scu_ctrl | 0x1, scu_base + 0x30); writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30);
} }
#endif #endif
scu_ctrl = __raw_readl(scu_base + SCU_CTRL); scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
/* already enabled? */ /* already enabled? */
if (scu_ctrl & 1) if (scu_ctrl & 1)
return; return;
scu_ctrl |= 1; scu_ctrl |= 1;
__raw_writel(scu_ctrl, scu_base + SCU_CTRL); writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
/* /*
* Ensure that the data accessed by CPU0 before the SCU was * Ensure that the data accessed by CPU0 before the SCU was
...@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode) ...@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode)
if (mode > 3 || mode == 1 || cpu > 3) if (mode > 3 || mode == 1 || cpu > 3)
return -EINVAL; return -EINVAL;
val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
val |= mode; val |= mode;
__raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu); writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu);
return 0; return 0;
} }
...@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode, ...@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode,
case CLOCK_EVT_MODE_PERIODIC: case CLOCK_EVT_MODE_PERIODIC:
ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE
| TWD_TIMER_CONTROL_PERIODIC; | TWD_TIMER_CONTROL_PERIODIC;
__raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
twd_base + TWD_TIMER_LOAD); twd_base + TWD_TIMER_LOAD);
break; break;
case CLOCK_EVT_MODE_ONESHOT: case CLOCK_EVT_MODE_ONESHOT:
...@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode, ...@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode,
ctrl = 0; ctrl = 0;
} }
__raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
} }
static int twd_set_next_event(unsigned long evt, static int twd_set_next_event(unsigned long evt,
struct clock_event_device *unused) struct clock_event_device *unused)
{ {
unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL);
ctrl |= TWD_TIMER_CONTROL_ENABLE; ctrl |= TWD_TIMER_CONTROL_ENABLE;
__raw_writel(evt, twd_base + TWD_TIMER_COUNTER); writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER);
__raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
return 0; return 0;
} }
...@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt, ...@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt,
*/ */
static int twd_timer_ack(void) static int twd_timer_ack(void)
{ {
if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) {
__raw_writel(1, twd_base + TWD_TIMER_INTSTAT); writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT);
return 1; return 1;
} }
...@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void) ...@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void)
waitjiffies += 5; waitjiffies += 5;
/* enable, no interrupt or reload */ /* enable, no interrupt or reload */
__raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL);
/* maximum value */ /* maximum value */
__raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
while (get_jiffies_64() < waitjiffies) while (get_jiffies_64() < waitjiffies)
udelay(10); udelay(10);
count = __raw_readl(twd_base + TWD_TIMER_COUNTER); count = readl_relaxed(twd_base + TWD_TIMER_COUNTER);
twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5); twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
...@@ -277,7 +277,7 @@ static void twd_timer_setup(void) ...@@ -277,7 +277,7 @@ static void twd_timer_setup(void)
* bother with the below. * bother with the below.
*/ */
if (per_cpu(percpu_setup_called, cpu)) { if (per_cpu(percpu_setup_called, cpu)) {
__raw_writel(0, twd_base + TWD_TIMER_CONTROL); writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
clockevents_register_device(clk); clockevents_register_device(clk);
enable_percpu_irq(clk->irq, 0); enable_percpu_irq(clk->irq, 0);
return; return;
...@@ -290,7 +290,7 @@ static void twd_timer_setup(void) ...@@ -290,7 +290,7 @@ static void twd_timer_setup(void)
* The following is done once per CPU the first time .setup() is * The following is done once per CPU the first time .setup() is
* called. * called.
*/ */
__raw_writel(0, twd_base + TWD_TIMER_CONTROL); writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
clk->name = "local_timer"; clk->name = "local_timer";
clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
......
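The wholesale __raw_readl/__raw_writel to readl_relaxed/writel_relaxed conversion in scu.c and smp_twd.c is part of the big-endian enablement: the _relaxed accessors convert to and from the little-endian register layout but still skip the heavyweight barriers. Roughly (a simplification of the asm/io.h behaviour, not code added by this merge):

	/* __raw_readl(a)   : native-endian load, no swap, no barrier
	 * readl_relaxed(a) : le32_to_cpu(__raw_readl(a)), still no barrier
	 * readl(a)         : readl_relaxed(a) followed by a read barrier
	 */

On a little-endian kernel the swap compiles away, so the change is a no-op there.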
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include <asm/suspend.h> #include <asm/suspend.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
extern int __cpu_suspend(unsigned long, int (*)(unsigned long)); extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
extern void cpu_resume_mmu(void); extern void cpu_resume_mmu(void);
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
...@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void); ...@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void);
int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
{ {
struct mm_struct *mm = current->active_mm; struct mm_struct *mm = current->active_mm;
u32 __mpidr = cpu_logical_map(smp_processor_id());
int ret; int ret;
if (!idmap_pgd) if (!idmap_pgd)
...@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ...@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
* resume (indicated by a zero return code), we need to switch * resume (indicated by a zero return code), we need to switch
* back to the correct page tables. * back to the correct page tables.
*/ */
ret = __cpu_suspend(arg, fn); ret = __cpu_suspend(arg, fn, __mpidr);
if (ret == 0) { if (ret == 0) {
cpu_switch_mm(mm->pgd, mm); cpu_switch_mm(mm->pgd, mm);
local_flush_bp_all(); local_flush_bp_all();
...@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ...@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
#else #else
int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
{ {
return __cpu_suspend(arg, fn); u32 __mpidr = cpu_logical_map(smp_processor_id());
return __cpu_suspend(arg, fn, __mpidr);
} }
#define idmap_pgd NULL #define idmap_pgd NULL
#endif #endif
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <asm/unwind.h> #include <asm/unwind.h>
#include <asm/tls.h> #include <asm/tls.h>
#include <asm/system_misc.h> #include <asm/system_misc.h>
#include <asm/opcodes.h>
static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" };
...@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs, ...@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
int is_valid_bugaddr(unsigned long pc) int is_valid_bugaddr(unsigned long pc)
{ {
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
unsigned short bkpt; u16 bkpt;
u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE);
#else #else
unsigned long bkpt; u32 bkpt;
u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE);
#endif #endif
if (probe_kernel_address((unsigned *)pc, bkpt)) if (probe_kernel_address((unsigned *)pc, bkpt))
return 0; return 0;
return bkpt == BUG_INSTR_VALUE; return bkpt == insn;
} }
#endif #endif
...@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) ...@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
if (processor_mode(regs) == SVC_MODE) { if (processor_mode(regs) == SVC_MODE) {
#ifdef CONFIG_THUMB2_KERNEL #ifdef CONFIG_THUMB2_KERNEL
if (thumb_mode(regs)) { if (thumb_mode(regs)) {
instr = ((u16 *)pc)[0]; instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]);
if (is_wide_instruction(instr)) { if (is_wide_instruction(instr)) {
instr <<= 16; u16 inst2;
instr |= ((u16 *)pc)[1]; inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]);
instr = __opcode_thumb32_compose(instr, inst2);
} }
} else } else
#endif #endif
instr = *(u32 *) pc; instr = __mem_to_opcode_arm(*(u32 *) pc);
} else if (thumb_mode(regs)) { } else if (thumb_mode(regs)) {
if (get_user(instr, (u16 __user *)pc)) if (get_user(instr, (u16 __user *)pc))
goto die_sig; goto die_sig;
instr = __mem_to_opcode_thumb16(instr);
if (is_wide_instruction(instr)) { if (is_wide_instruction(instr)) {
unsigned int instr2; unsigned int instr2;
if (get_user(instr2, (u16 __user *)pc+1)) if (get_user(instr2, (u16 __user *)pc+1))
goto die_sig; goto die_sig;
instr <<= 16; instr2 = __mem_to_opcode_thumb16(instr2);
instr |= instr2; instr = __opcode_thumb32_compose(instr, instr2);
} }
} else if (get_user(instr, (u32 __user *)pc)) { } else if (get_user(instr, (u32 __user *)pc)) {
instr = __mem_to_opcode_arm(instr);
goto die_sig; goto die_sig;
} }
......
...@@ -10,6 +10,11 @@ UNWIND( .fnstart ) ...@@ -10,6 +10,11 @@ UNWIND( .fnstart )
and r3, r0, #31 @ Get bit offset and r3, r0, #31 @ Get bit offset
mov r0, r0, lsr #5 mov r0, r0, lsr #5
add r1, r1, r0, lsl #2 @ Get word offset add r1, r1, r0, lsl #2 @ Get word offset
#if __LINUX_ARM_ARCH__ >= 7
.arch_extension mp
ALT_SMP(W(pldw) [r1])
ALT_UP(W(nop))
#endif
mov r3, r2, lsl r3 mov r3, r2, lsl r3
1: ldrex r2, [r1] 1: ldrex r2, [r1]
\instr r2, r2, r3 \instr r2, r2, r3
......
...@@ -4,6 +4,7 @@ config ARCH_HIGHBANK ...@@ -4,6 +4,7 @@ config ARCH_HIGHBANK
select ARCH_HAS_CPUFREQ select ARCH_HAS_CPUFREQ
select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_HAS_HOLES_MEMORYMODEL
select ARCH_HAS_OPP select ARCH_HAS_OPP
select ARCH_SUPPORTS_BIG_ENDIAN
select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_OPTIONAL_GPIOLIB
select ARM_AMBA select ARM_AMBA
select ARM_ERRATA_764369 select ARM_ERRATA_764369
......
if ARCH_IXP4XX if ARCH_IXP4XX
config ARCH_SUPPORTS_BIG_ENDIAN
bool
default y
menu "Intel IXP4xx Implementation Options" menu "Intel IXP4xx Implementation Options"
comment "IXP4xx Platforms" comment "IXP4xx Platforms"
......
config ARCH_MVEBU config ARCH_MVEBU
bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7 bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7
select ARCH_SUPPORTS_BIG_ENDIAN
select CLKSRC_MMIO select CLKSRC_MMIO
select COMMON_CLK select COMMON_CLK
select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
#define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0 #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0
#define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4 #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4
#include <asm/assembler.h>
.text .text
/* /*
* r0: Coherency fabric base register address * r0: Coherency fabric base register address
...@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent) ...@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent)
/* Create bit by cpu index */ /* Create bit by cpu index */
mov r3, #(1 << 24) mov r3, #(1 << 24)
lsl r1, r3, r1 lsl r1, r3, r1
ARM_BE8(rev r1, r1)
/* Add CPU to SMP group - Atomic */ /* Add CPU to SMP group - Atomic */
add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET
......
...@@ -21,12 +21,16 @@ ...@@ -21,12 +21,16 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/init.h> #include <linux/init.h>
#include <asm/assembler.h>
/* /*
* Armada XP specific entry point for secondary CPUs. * Armada XP specific entry point for secondary CPUs.
* We add the CPU to the coherency fabric and then jump to secondary * We add the CPU to the coherency fabric and then jump to secondary
* startup * startup
*/ */
ENTRY(armada_xp_secondary_startup) ENTRY(armada_xp_secondary_startup)
ARM_BE8(setend be ) @ go BE8 if entered LE
/* Get coherency fabric base physical address */ /* Get coherency fabric base physical address */
adr r0, 1f adr r0, 1f
ldr r1, [r0] ldr r1, [r0]
......
config ARCH_VEXPRESS config ARCH_VEXPRESS
bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7 bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7
select ARCH_REQUIRE_GPIOLIB select ARCH_REQUIRE_GPIOLIB
select ARCH_SUPPORTS_BIG_ENDIAN
select ARM_AMBA select ARM_AMBA
select ARM_GIC select ARM_GIC
select ARM_TIMER_SP804 select ARM_TIMER_SP804
......
...@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS ...@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS
help help
This option allows the use of custom mandatory barriers This option allows the use of custom mandatory barriers
included via the mach/barriers.h file. included via the mach/barriers.h file.
config ARCH_SUPPORTS_BIG_ENDIAN
bool
help
This option specifies that the architecture can support big-endian
operation.
...@@ -38,9 +38,8 @@ ENTRY(v6_early_abort) ...@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
bne do_DataAbort bne do_DataAbort
bic r1, r1, #1 << 11 @ clear bit 11 of FSR bic r1, r1, #1 << 11 @ clear bit 11 of FSR
ldr r3, [r4] @ read aborted ARM instruction ldr r3, [r4] @ read aborted ARM instruction
#ifdef CONFIG_CPU_ENDIAN_BE8 ARM_BE8(rev r3, r3)
rev r3, r3
#endif
do_ldrd_abort tmp=ip, insn=r3 do_ldrd_abort tmp=ip, insn=r3
tst r3, #1 << 20 @ L = 0 -> write tst r3, #1 << 20 @ L = 0 -> write
orreq r1, r1, #1 << 11 @ yes. orreq r1, r1, #1 << 11 @ yes.
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include <asm/cp15.h> #include <asm/cp15.h>
#include <asm/system_info.h> #include <asm/system_info.h>
#include <asm/unaligned.h> #include <asm/unaligned.h>
#include <asm/opcodes.h>
#include "fault.h" #include "fault.h"
...@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) ...@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (thumb_mode(regs)) { if (thumb_mode(regs)) {
u16 *ptr = (u16 *)(instrptr & ~1); u16 *ptr = (u16 *)(instrptr & ~1);
fault = probe_kernel_address(ptr, tinstr); fault = probe_kernel_address(ptr, tinstr);
tinstr = __mem_to_opcode_thumb16(tinstr);
if (!fault) { if (!fault) {
if (cpu_architecture() >= CPU_ARCH_ARMv7 && if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
IS_T32(tinstr)) { IS_T32(tinstr)) {
/* Thumb-2 32-bit */ /* Thumb-2 32-bit */
u16 tinst2 = 0; u16 tinst2 = 0;
fault = probe_kernel_address(ptr + 1, tinst2); fault = probe_kernel_address(ptr + 1, tinst2);
instr = (tinstr << 16) | tinst2; tinst2 = __mem_to_opcode_thumb16(tinst2);
instr = __opcode_thumb32_compose(tinstr, tinst2);
thumb2_32b = 1; thumb2_32b = 1;
} else { } else {
isize = 2; isize = 2;
instr = thumb2arm(tinstr); instr = thumb2arm(tinstr);
} }
} }
} else } else {
fault = probe_kernel_address(instrptr, instr); fault = probe_kernel_address(instrptr, instr);
instr = __mem_to_opcode_arm(instr);
}
if (fault) { if (fault) {
type = TYPE_FAULT; type = TYPE_FAULT;
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <asm/system_info.h> #include <asm/system_info.h>
pgd_t *idmap_pgd; pgd_t *idmap_pgd;
phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
#ifdef CONFIG_ARM_LPAE #ifdef CONFIG_ARM_LPAE
static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
...@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start, ...@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
unsigned long addr, end; unsigned long addr, end;
unsigned long next; unsigned long next;
addr = virt_to_phys(text_start); addr = virt_to_idmap(text_start);
end = virt_to_phys(text_end); end = virt_to_idmap(text_end);
pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
...@@ -90,8 +92,6 @@ static int __init init_static_idmap(void) ...@@ -90,8 +92,6 @@ static int __init init_static_idmap(void)
if (!idmap_pgd) if (!idmap_pgd)
return -ENOMEM; return -ENOMEM;
pr_info("Setting up static identity map for 0x%p - 0x%p\n",
__idmap_text_start, __idmap_text_end);
identity_mapping_add(idmap_pgd, __idmap_text_start, identity_mapping_add(idmap_pgd, __idmap_text_start,
__idmap_text_end, 0); __idmap_text_end, 0);
......
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
#include <asm/highmem.h> #include <asm/highmem.h>
#include <asm/system_info.h> #include <asm/system_info.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/procinfo.h>
#include <asm/memory.h>
#include <asm/mach/arch.h> #include <asm/mach/arch.h>
#include <asm/mach/map.h> #include <asm/mach/map.h>
...@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void) ...@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void)
} }
} }
#ifdef CONFIG_ARM_LPAE
/*
* early_paging_init() recreates the boot-time page table setup, allowing machines
* to switch over to a high (>4G) address space on LPAE systems
*/
void __init early_paging_init(const struct machine_desc *mdesc,
struct proc_info_list *procinfo)
{
pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
unsigned long map_start, map_end;
pgd_t *pgd0, *pgdk;
pud_t *pud0, *pudk, *pud_start;
pmd_t *pmd0, *pmdk;
phys_addr_t phys;
int i;
if (!(mdesc->init_meminfo))
return;
/* remap kernel code and data */
map_start = init_mm.start_code;
map_end = init_mm.brk;
/* get a handle on things... */
pgd0 = pgd_offset_k(0);
pud_start = pud0 = pud_offset(pgd0, 0);
pmd0 = pmd_offset(pud0, 0);
pgdk = pgd_offset_k(map_start);
pudk = pud_offset(pgdk, map_start);
pmdk = pmd_offset(pudk, map_start);
mdesc->init_meminfo();
/* Run the patch stub to update the constants */
fixup_pv_table(&__pv_table_begin,
(&__pv_table_end - &__pv_table_begin) << 2);
/*
* Cache cleaning operations for self-modifying code.
* We should clean the entries by MVA, but running a
* for loop over every pv_table entry pointer would
* just complicate the code.
*/
flush_cache_louis();
dsb();
isb();
/* remap level 1 table */
for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
set_pud(pud0,
__pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
pmd0 += PTRS_PER_PMD;
}
/* remap pmds for kernel mapping */
phys = __pa(map_start) & PMD_MASK;
do {
*pmdk++ = __pmd(phys | pmdprot);
phys += PMD_SIZE;
} while (phys < map_end);
flush_cache_all();
cpu_switch_mm(pgd0, &init_mm);
cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
local_flush_bp_all();
local_flush_tlb_all();
}
#else
void __init early_paging_init(const struct machine_desc *mdesc,
struct proc_info_list *procinfo)
{
if (mdesc->init_meminfo)
mdesc->init_meminfo();
}
#endif
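A platform opting in would provide an init_meminfo callback in its machine descriptor. The fragment below is purely hypothetical (names invented) and only shows the shape of the hook; the real work of describing the >4G alias (updating the 64-bit __pv_phys_offset/__pv_offset that the patch-stub rerun above propagates, installing arch_virt_to_idmap, and so on) is platform specific:

	static void __init myboard_init_meminfo(void)
	{
		/* switch the memory view over to the high physical alias here */
	}

	DT_MACHINE_START(MYBOARD, "Hypothetical LPAE platform")
		/* ... */
		.init_meminfo	= myboard_init_meminfo,
	MACHINE_END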
/* /*
* paging_init() sets up the page tables, initialises the zone memory * paging_init() sets up the page tables, initialises the zone memory
* maps, and sets up the zero page, bad page and bad page tables. * maps, and sets up the zero page, bad page and bad page tables.
......
...@@ -295,6 +295,15 @@ void __init sanity_check_meminfo(void) ...@@ -295,6 +295,15 @@ void __init sanity_check_meminfo(void)
high_memory = __va(end - 1) + 1; high_memory = __va(end - 1) + 1;
} }
/*
* early_paging_init() recreates the boot-time page table setup, allowing machines
* to switch over to a high (>4G) address space on LPAE systems
*/
void __init early_paging_init(const struct machine_desc *mdesc,
struct proc_info_list *procinfo)
{
}
/* /*
* paging_init() sets up the page tables, initialises the zone memory * paging_init() sets up the page tables, initialises the zone memory
* maps, and sets up the zero page, bad page and bad page tables. * maps, and sets up the zero page, bad page and bad page tables.
......
...@@ -220,9 +220,7 @@ __v6_setup: ...@@ -220,9 +220,7 @@ __v6_setup:
#endif /* CONFIG_MMU */ #endif /* CONFIG_MMU */
adr r5, v6_crval adr r5, v6_crval
ldmia r5, {r5, r6} ldmia r5, {r5, r6}
#ifdef CONFIG_CPU_ENDIAN_BE8 ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
orr r6, r6, #1 << 25 @ big-endian page tables
#endif
mrc p15, 0, r0, c1, c0, 0 @ read control register mrc p15, 0, r0, c1, c0, 0 @ read control register
bic r0, r0, r5 @ clear bits them bic r0, r0, r5 @ clear bits them
orr r0, r0, r6 @ set them orr r0, r0, r6 @ set them
......
...@@ -367,9 +367,7 @@ __v7_setup: ...@@ -367,9 +367,7 @@ __v7_setup:
#endif #endif
adr r5, v7_crval adr r5, v7_crval
ldmia r5, {r5, r6} ldmia r5, {r5, r6}
#ifdef CONFIG_CPU_ENDIAN_BE8 ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
orr r6, r6, #1 << 25 @ big-endian page tables
#endif
#ifdef CONFIG_SWP_EMULATE #ifdef CONFIG_SWP_EMULATE
orr r5, r5, #(1 << 10) @ set SW bit in "clear" orr r5, r5, #(1 << 10) @ set SW bit in "clear"
bic r6, r6, #(1 << 10) @ clear it in "mmuset" bic r6, r6, #(1 << 10) @ clear it in "mmuset"
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <linux/if_vlan.h> #include <linux/if_vlan.h>
#include <asm/cacheflush.h> #include <asm/cacheflush.h>
#include <asm/hwcap.h> #include <asm/hwcap.h>
#include <asm/opcodes.h>
#include "bpf_jit_32.h" #include "bpf_jit_32.h"
...@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor) ...@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor)
static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
{ {
inst |= (cond << 28);
inst = __opcode_to_mem_arm(inst);
if (ctx->target != NULL) if (ctx->target != NULL)
ctx->target[ctx->idx] = inst | (cond << 28); ctx->target[ctx->idx] = inst;
ctx->idx++; ctx->idx++;
} }
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
*/ */
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/init.h> #include <linux/init.h>
#include <asm/assembler.h>
/* /*
* Realview/Versatile Express specific entry point for secondary CPUs. * Realview/Versatile Express specific entry point for secondary CPUs.
...@@ -17,6 +18,7 @@ ...@@ -17,6 +18,7 @@
* until we're ready for them to initialise. * until we're ready for them to initialise.
*/ */
ENTRY(versatile_secondary_startup) ENTRY(versatile_secondary_startup)
ARM_BE8(setend be)
mrc p15, 0, r0, c0, c0, 5 mrc p15, 0, r0, c0, c0, 5
bic r0, #0xff000000 bic r0, #0xff000000
adr r4, 1f adr r4, 1f
......
...@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM ...@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM
See <http://csrc.nist.gov/encryption/aes/> for more information. See <http://csrc.nist.gov/encryption/aes/> for more information.
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on ARM && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM
select CRYPTO_ABLK_HELPER
help
Use a faster and more secure NEON-based implementation of AES in CBC,
CTR and XTS modes.

Bit-sliced AES gives around a 45% speedup on Cortex-A15 for CTR mode
and for XTS mode encryption; CBC and XTS mode decryption speed up by
around 25%. (CBC encryption speed is not affected by this driver.)

This implementation does not rely on any lookup tables, so it is
believed to be invulnerable to cache timing attacks.
config CRYPTO_ANUBIS config CRYPTO_ANUBIS
tristate "Anubis cipher algorithm" tristate "Anubis cipher algorithm"
select CRYPTO_ALGAPI select CRYPTO_ALGAPI
......
...@@ -280,7 +280,7 @@ asmlinkage void __naked cci_enable_port_for_self(void) ...@@ -280,7 +280,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
/* Enable the CCI port */ /* Enable the CCI port */
" ldr r0, [r0, %[offsetof_port_phys]] \n" " ldr r0, [r0, %[offsetof_port_phys]] \n"
" mov r3, #"__stringify(CCI_ENABLE_REQ)" \n" " mov r3, %[cci_enable_req]\n"
" str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n" " str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n"
/* poll the status reg for completion */ /* poll the status reg for completion */
...@@ -288,7 +288,7 @@ asmlinkage void __naked cci_enable_port_for_self(void) ...@@ -288,7 +288,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
" ldr r0, [r1] \n" " ldr r0, [r1] \n"
" ldr r0, [r0, r1] @ cci_ctrl_base \n" " ldr r0, [r0, r1] @ cci_ctrl_base \n"
"4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n" "4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n"
" tst r1, #1 \n" " tst r1, %[cci_control_status_bits] \n"
" bne 4b \n" " bne 4b \n"
" mov r0, #0 \n" " mov r0, #0 \n"
...@@ -301,6 +301,8 @@ asmlinkage void __naked cci_enable_port_for_self(void) ...@@ -301,6 +301,8 @@ asmlinkage void __naked cci_enable_port_for_self(void)
"7: .word cci_ctrl_phys - . \n" "7: .word cci_ctrl_phys - . \n"
: : : :
[sizeof_cpu_port] "i" (sizeof(cpu_port)), [sizeof_cpu_port] "i" (sizeof(cpu_port)),
[cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ),
[cci_control_status_bits] "i" cpu_to_le32(1),
#ifndef __ARMEB__ #ifndef __ARMEB__
[offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)), [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)),
#else #else
......
...@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, ...@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
return -EINVAL; return -EINVAL;
raw_spin_lock(&irq_controller_lock);
mask = 0xff << shift; mask = 0xff << shift;
bit = gic_cpu_map[cpu] << shift; bit = gic_cpu_map[cpu] << shift;
raw_spin_lock(&irq_controller_lock);
val = readl_relaxed(reg) & ~mask; val = readl_relaxed(reg) & ~mask;
writel_relaxed(val | bit, reg); writel_relaxed(val | bit, reg);
raw_spin_unlock(&irq_controller_lock); raw_spin_unlock(&irq_controller_lock);
...@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic) ...@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic)
void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
{ {
int cpu; int cpu;
unsigned long map = 0; unsigned long flags, map = 0;
raw_spin_lock_irqsave(&irq_controller_lock, flags);
/* Convert our logical CPU mask into a physical one. */ /* Convert our logical CPU mask into a physical one. */
for_each_cpu(cpu, mask) for_each_cpu(cpu, mask)
...@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) ...@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
/* this always happens on GIC0 */ /* this always happens on GIC0 */
writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
}
#endif
#ifdef CONFIG_BL_SWITCHER
/*
* gic_send_sgi - send a SGI directly to given CPU interface number
*
* cpu_id: the ID for the destination CPU interface
* irq: the IPI number to send a SGI for
*/
void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
{
BUG_ON(cpu_id >= NR_GIC_CPU_IF);
cpu_id = 1 << cpu_id;
/* this always happens on GIC0 */
writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
}
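For illustration, a minimal caller sketch (hypothetical, not part of this patch) that raises SGI 0 directly on CPU interface 1 using the helper above; the BUG_ON() above rejects cpu_id values that are not below NR_GIC_CPU_IF.
/* Hypothetical caller sketch: raise SGI 0 on CPU interface 1. */
static void example_kick_cpu_if1(void)
{
	gic_send_sgi(1, 0);	/* cpu_id must be < NR_GIC_CPU_IF */
}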
/*
* gic_get_cpu_id - get the CPU interface ID for the specified CPU
*
* @cpu: the logical CPU number to get the GIC ID for.
*
* Return the CPU interface ID for the given logical CPU number,
* or -1 if the CPU number is too large or the interface ID is
* unknown (more than one bit set).
*/
int gic_get_cpu_id(unsigned int cpu)
{
unsigned int cpu_bit;
if (cpu >= NR_GIC_CPU_IF)
return -1;
cpu_bit = gic_cpu_map[cpu];
if (cpu_bit & (cpu_bit - 1))
return -1;
return __ffs(cpu_bit);
} }
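As a worked example of the single-bit check above (the values are illustrative only, not taken from real hardware):
/*
 * gic_cpu_map[cpu] == 0x02: one bit set, __ffs(0x02) == 1, so 1 is returned.
 * gic_cpu_map[cpu] == 0x0c: two bits set, 0x0c & 0x0b != 0, so -1 is returned.
 */
static int example_single_bit_check(unsigned int cpu_bit)
{
	if (cpu_bit & (cpu_bit - 1))	/* more than one bit set? */
		return -1;
	return __ffs(cpu_bit);
}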
/*
* gic_migrate_target - migrate IRQs to another CPU interface
*
* @new_cpu_id: the CPU target ID to migrate IRQs to
*
* Migrate all peripheral interrupts with a target matching the current CPU
* to the interface corresponding to @new_cpu_id. The CPU interface mapping
* is also updated. Targets to other CPU interfaces are unchanged.
* This must be called with IRQs locally disabled.
*/
void gic_migrate_target(unsigned int new_cpu_id)
{
unsigned int cur_cpu_id, gic_irqs, gic_nr = 0;
void __iomem *dist_base;
int i, ror_val, cpu = smp_processor_id();
u32 val, cur_target_mask, active_mask;
if (gic_nr >= MAX_GIC_NR)
BUG();
dist_base = gic_data_dist_base(&gic_data[gic_nr]);
if (!dist_base)
return;
gic_irqs = gic_data[gic_nr].gic_irqs;
cur_cpu_id = __ffs(gic_cpu_map[cpu]);
cur_target_mask = 0x01010101 << cur_cpu_id;
ror_val = (cur_cpu_id - new_cpu_id) & 31;
raw_spin_lock(&irq_controller_lock);
/* Update the target interface for this logical CPU */
gic_cpu_map[cpu] = 1 << new_cpu_id;
/*
* Find all the peripheral interrupts targeting the current
* CPU interface and migrate them to the new CPU interface.
* We skip DIST_TARGET 0 to 7 as they are read-only.
*/
for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
active_mask = val & cur_target_mask;
if (active_mask) {
val &= ~active_mask;
val |= ror32(active_mask, ror_val);
writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4);
}
}
raw_spin_unlock(&irq_controller_lock);
/*
* Now let's migrate and clear any potential SGIs that might be
* pending for us (cur_cpu_id). Since GIC_DIST_SGI_PENDING_SET
* is a banked register, we can only forward the SGI using
* GIC_DIST_SOFTINT. The original SGI source is lost but Linux
* doesn't use that information anyway.
*
* For the same reason we do not adjust SGI source information
* for previously sent SGIs by us to other CPUs either.
*/
for (i = 0; i < 16; i += 4) {
int j;
val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
if (!val)
continue;
writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
for (j = i; j < i + 4; j++) {
if (val & 0xff)
writel_relaxed((1 << (new_cpu_id + 16)) | j,
dist_base + GIC_DIST_SOFTINT);
val >>= 8;
}
}
}
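A minimal sketch of the calling convention (hypothetical switcher-side code, not part of this patch), honouring the requirement that IRQs be locally disabled:
static void example_switch_gic(unsigned int new_cpu_id)
{
	unsigned long flags;

	local_irq_save(flags);		/* gic_migrate_target() requires IRQs off */
	gic_migrate_target(new_cpu_id);
	local_irq_restore(flags);
}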
/*
* gic_get_sgir_physaddr - get the physical address for the SGI register
*
* Return the physical address of the SGI register to be used
* by some early assembly code when the kernel is not yet available.
*/
static unsigned long gic_dist_physaddr;
unsigned long gic_get_sgir_physaddr(void)
{
if (!gic_dist_physaddr)
return 0;
return gic_dist_physaddr + GIC_DIST_SOFTINT;
}
void __init gic_init_physaddr(struct device_node *node)
{
struct resource res;
if (of_address_to_resource(node, 0, &res) == 0) {
gic_dist_physaddr = res.start;
pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
}
}
#else
#define gic_init_physaddr(node) do { } while (0)
#endif #endif
static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
...@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) ...@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent)
percpu_offset = 0; percpu_offset = 0;
gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
if (!gic_cnt)
gic_init_physaddr(node);
if (parent) { if (parent) {
irq = irq_of_parse_and_map(node, 0); irq = irq_of_parse_and_map(node, 0);
......
...@@ -31,6 +31,8 @@ ...@@ -31,6 +31,8 @@
#define GIC_DIST_TARGET 0x800 #define GIC_DIST_TARGET 0x800
#define GIC_DIST_CONFIG 0xc00 #define GIC_DIST_CONFIG 0xc00
#define GIC_DIST_SOFTINT 0xf00 #define GIC_DIST_SOFTINT 0xf00
#define GIC_DIST_SGI_PENDING_CLEAR 0xf10
#define GIC_DIST_SGI_PENDING_SET 0xf20
#define GICH_HCR 0x0 #define GICH_HCR 0x0
#define GICH_VTR 0x4 #define GICH_VTR 0x4
...@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start, ...@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start,
gic_init_bases(nr, start, dist, cpu, 0, NULL); gic_init_bases(nr, start, dist, cpu, 0, NULL);
} }
void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
int gic_get_cpu_id(unsigned int cpu);
void gic_migrate_target(unsigned int new_cpu_id);
unsigned long gic_get_sgir_physaddr(void);
#endif /* __ASSEMBLY */ #endif /* __ASSEMBLY */
#endif #endif
#undef TRACE_SYSTEM
#define TRACE_SYSTEM power
#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_POWER_CPU_MIGRATE_H
#include <linux/tracepoint.h>
#define __cpu_migrate_proto \
TP_PROTO(u64 timestamp, \
u32 cpu_hwid)
#define __cpu_migrate_args \
TP_ARGS(timestamp, \
cpu_hwid)
DECLARE_EVENT_CLASS(cpu_migrate,
__cpu_migrate_proto,
__cpu_migrate_args,
TP_STRUCT__entry(
__field(u64, timestamp )
__field(u32, cpu_hwid )
),
TP_fast_assign(
__entry->timestamp = timestamp;
__entry->cpu_hwid = cpu_hwid;
),
TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
(unsigned long long)__entry->timestamp,
(unsigned long)__entry->cpu_hwid
)
);
#define __define_cpu_migrate_event(name) \
DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \
__cpu_migrate_proto, \
__cpu_migrate_args \
)
__define_cpu_migrate_event(begin);
__define_cpu_migrate_event(finish);
__define_cpu_migrate_event(current);
#undef __define_cpu_migrate_event
#undef __cpu_migrate_proto
#undef __cpu_migrate_args
/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
/*
* Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
* a whole-cluster migration:
*/
#define CPU_MIGRATE_ALL_CPUS 0x80000000U
#endif
#endif /* _TRACE_POWER_CPU_MIGRATE_H */
/* This part must be outside protection */
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE power_cpu_migrate
#include <trace/define_trace.h>
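For illustration, a hedged sketch of how a client could emit these events; the wrapper below is hypothetical, and only the trace_cpu_migrate_*() calls follow from the event definitions above (the include path assumes the header is installed as include/trace/events/power_cpu_migrate.h, as TRACE_INCLUDE_FILE suggests):
/* In exactly one .c file, instantiate the tracepoints before including. */
#define CREATE_TRACE_POINTS
#include <trace/events/power_cpu_migrate.h>

/* Hypothetical helper: wrap a migration in begin/finish events. */
static void example_trace_migration(u64 timestamp, u32 cpu_hwid)
{
	trace_cpu_migrate_begin(timestamp, cpu_hwid);
	/* ... migration work ... */
	trace_cpu_migrate_finish(timestamp, cpu_hwid);
}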
...@@ -2,3 +2,6 @@ ifndef NO_DWARF ...@@ -2,3 +2,6 @@ ifndef NO_DWARF
PERF_HAVE_DWARF_REGS := 1 PERF_HAVE_DWARF_REGS := 1
LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
endif endif
ifndef NO_LIBUNWIND
LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
endif
#ifndef ARCH_PERF_REGS_H
#define ARCH_PERF_REGS_H
#include <stdlib.h>
#include "../../util/types.h"
#include <asm/perf_regs.h>
#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1)
#define PERF_REG_IP PERF_REG_ARM_PC
#define PERF_REG_SP PERF_REG_ARM_SP
static inline const char *perf_reg_name(int id)
{
switch (id) {
case PERF_REG_ARM_R0:
return "r0";
case PERF_REG_ARM_R1:
return "r1";
case PERF_REG_ARM_R2:
return "r2";
case PERF_REG_ARM_R3:
return "r3";
case PERF_REG_ARM_R4:
return "r4";
case PERF_REG_ARM_R5:
return "r5";
case PERF_REG_ARM_R6:
return "r6";
case PERF_REG_ARM_R7:
return "r7";
case PERF_REG_ARM_R8:
return "r8";
case PERF_REG_ARM_R9:
return "r9";
case PERF_REG_ARM_R10:
return "r10";
case PERF_REG_ARM_FP:
return "fp";
case PERF_REG_ARM_IP:
return "ip";
case PERF_REG_ARM_SP:
return "sp";
case PERF_REG_ARM_LR:
return "lr";
case PERF_REG_ARM_PC:
return "pc";
default:
return NULL;
}
return NULL;
}
#endif /* ARCH_PERF_REGS_H */
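A short usage sketch (illustrative only, assuming PERF_REG_ARM_MAX from <asm/perf_regs.h>) that walks the ARM register indices and prints their names:
#include <stdio.h>

/* Illustrative only: list every ARM perf register name. */
static void example_list_regs(void)
{
	int id;

	for (id = 0; id < PERF_REG_ARM_MAX; id++)
		printf("reg %d: %s\n", id, perf_reg_name(id));
}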
#include <errno.h>
#include <libunwind.h>
#include "perf_regs.h"
#include "../../util/unwind.h"
int unwind__arch_reg_id(int regnum)
{
switch (regnum) {
case UNW_ARM_R0:
return PERF_REG_ARM_R0;
case UNW_ARM_R1:
return PERF_REG_ARM_R1;
case UNW_ARM_R2:
return PERF_REG_ARM_R2;
case UNW_ARM_R3:
return PERF_REG_ARM_R3;
case UNW_ARM_R4:
return PERF_REG_ARM_R4;
case UNW_ARM_R5:
return PERF_REG_ARM_R5;
case UNW_ARM_R6:
return PERF_REG_ARM_R6;
case UNW_ARM_R7:
return PERF_REG_ARM_R7;
case UNW_ARM_R8:
return PERF_REG_ARM_R8;
case UNW_ARM_R9:
return PERF_REG_ARM_R9;
case UNW_ARM_R10:
return PERF_REG_ARM_R10;
case UNW_ARM_R11:
return PERF_REG_ARM_FP;
case UNW_ARM_R12:
return PERF_REG_ARM_IP;
case UNW_ARM_R13:
return PERF_REG_ARM_SP;
case UNW_ARM_R14:
return PERF_REG_ARM_LR;
case UNW_ARM_R15:
return PERF_REG_ARM_PC;
default:
pr_err("unwind: invalid reg id %d\n", regnum);
return -EINVAL;
}
return -EINVAL;
}
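Illustrative only: perf's unwind code uses this mapping when libunwind asks for a register value; for instance, libunwind's stack-pointer number resolves to the perf SP sample index.
/* Hypothetical check: libunwind's r13 maps to the perf SP sample index. */
static int example_sp_index(void)
{
	return unwind__arch_reg_id(UNW_ARM_R13);	/* == PERF_REG_ARM_SP */
}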
...@@ -29,6 +29,10 @@ ifeq ($(ARCH),x86_64) ...@@ -29,6 +29,10 @@ ifeq ($(ARCH),x86_64)
NO_PERF_REGS := 0 NO_PERF_REGS := 0
LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
endif endif
ifeq ($(ARCH),arm)
NO_PERF_REGS := 0
LIBUNWIND_LIBS = -lunwind -lunwind-arm
endif
ifeq ($(NO_PERF_REGS),0) ifeq ($(NO_PERF_REGS),0)
CFLAGS += -DHAVE_PERF_REGS CFLAGS += -DHAVE_PERF_REGS
...@@ -208,8 +212,7 @@ ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y) ...@@ -208,8 +212,7 @@ ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
endif # try-cc endif # try-cc
endif # NO_LIBELF endif # NO_LIBELF
# There's only x86 (both 32 and 64) support for CFI unwind so far ifeq ($(LIBUNWIND_LIBS),)
ifneq ($(ARCH),x86)
NO_LIBUNWIND := 1 NO_LIBUNWIND := 1
endif endif
...@@ -223,9 +226,13 @@ endif ...@@ -223,9 +226,13 @@ endif
FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS) FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS)
ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y) ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y)
msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99); msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
NO_LIBUNWIND := 1 NO_LIBUNWIND := 1
endif # Libunwind support endif # Libunwind support
ifneq ($(call try-cc,$(SOURCE_LIBUNWIND_DEBUG_FRAME),$(FLAGS_UNWIND),libunwind debug_frame),y)
msg := $(warning No debug_frame support found in libunwind);
CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
endif # debug_frame support in libunwind
endif # NO_LIBUNWIND endif # NO_LIBUNWIND
ifndef NO_LIBUNWIND ifndef NO_LIBUNWIND
......
...@@ -185,7 +185,6 @@ extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, ...@@ -185,7 +185,6 @@ extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
unw_proc_info_t *pi, unw_proc_info_t *pi,
int need_unwind_info, void *arg); int need_unwind_info, void *arg);
#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
int main(void) int main(void)
...@@ -197,6 +196,26 @@ int main(void) ...@@ -197,6 +196,26 @@ int main(void)
return 0; return 0;
} }
endef endef
define SOURCE_LIBUNWIND_DEBUG_FRAME
#include <libunwind.h>
#include <stdlib.h>
extern int
UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
unw_word_t ip, unw_word_t segbase,
const char *obj_name, unw_word_t start,
unw_word_t end);
#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
int main(void)
{
dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
return 0;
}
endef
endif endif
ifndef NO_BACKTRACE ifndef NO_BACKTRACE
......
...@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, ...@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
extern int
UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
unw_word_t ip,
unw_word_t segbase,
const char *obj_name, unw_word_t start,
unw_word_t end);
#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
#define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */ #define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */
#define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */ #define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */
...@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine, ...@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
return 0; return 0;
} }
static int read_unwind_spec(struct dso *dso, struct machine *machine, static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
u64 *table_data, u64 *segbase, u64 *fde_count) u64 *table_data, u64 *segbase,
u64 *fde_count)
{ {
int ret = -EINVAL, fd; int ret = -EINVAL, fd;
u64 offset; u64 offset;
...@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine, ...@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
if (fd < 0) if (fd < 0)
return -EINVAL; return -EINVAL;
/* Check the .eh_frame section for unwinding info */
offset = elf_section_offset(fd, ".eh_frame_hdr"); offset = elf_section_offset(fd, ".eh_frame_hdr");
close(fd); close(fd);
...@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine, ...@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
table_data, segbase, table_data, segbase,
fde_count); fde_count);
/* TODO .debug_frame check if eh_frame_hdr fails */
return ret; return ret;
} }
#ifndef NO_LIBUNWIND_DEBUG_FRAME
static int read_unwind_spec_debug_frame(struct dso *dso,
struct machine *machine, u64 *offset)
{
int fd = dso__data_fd(dso, machine);
if (fd < 0)
return -EINVAL;
/* Check the .debug_frame section for unwinding info */
*offset = elf_section_offset(fd, ".debug_frame");
close(fd);
if (*offset)
return 0;
return -EINVAL;
}
#endif
static struct map *find_map(unw_word_t ip, struct unwind_info *ui) static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
{ {
struct addr_location al; struct addr_location al;
...@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, ...@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
pr_debug("unwind: find_proc_info dso %s\n", map->dso->name); pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
if (read_unwind_spec(map->dso, ui->machine, /* Check the .eh_frame section for unwinding info */
&table_data, &segbase, &fde_count)) if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
return -EINVAL; &table_data, &segbase, &fde_count)) {
memset(&di, 0, sizeof(di));
di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
di.start_ip = map->start;
di.end_ip = map->end;
di.u.rti.segbase = map->start + segbase;
di.u.rti.table_data = map->start + table_data;
di.u.rti.table_len = fde_count * sizeof(struct table_entry)
/ sizeof(unw_word_t);
return dwarf_search_unwind_table(as, ip, &di, pi,
need_unwind_info, arg);
}
#ifndef NO_LIBUNWIND_DEBUG_FRAME
/* Check the .debug_frame section for unwinding info */
if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
memset(&di, 0, sizeof(di));
dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name,
map->start, map->end);
return dwarf_search_unwind_table(as, ip, &di, pi,
need_unwind_info, arg);
}
#endif
memset(&di, 0, sizeof(di)); return -EINVAL;
di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
di.start_ip = map->start;
di.end_ip = map->end;
di.u.rti.segbase = map->start + segbase;
di.u.rti.table_data = map->start + table_data;
di.u.rti.table_len = fde_count * sizeof(struct table_entry)
/ sizeof(unw_word_t);
return dwarf_search_unwind_table(as, ip, &di, pi,
need_unwind_info, arg);
} }
static int access_fpreg(unw_addr_space_t __maybe_unused as, static int access_fpreg(unw_addr_space_t __maybe_unused as,
......