Commit f0d25b5d authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm handling updates from Ingo Molnar:

 - Add new NX-stack self-test

 - Improve NUMA partial-CFMWS handling

 - Fix #VC handler bugs resulting in SEV-SNP boot failures

 - Drop the 4MB memory size restriction on minimal NUMA nodes

 - Reorganize headers a bit, in preparation to header dependency
   reduction efforts

 - Misc cleanups & fixes

* tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Drop the 4 MB restriction on minimal NUMA node memory size
  selftests/x86/lam: Zero out buffer for readlink()
  x86/sev: Drop unneeded #include
  x86/sev: Move sev_setup_arch() to mem_encrypt.c
  x86/tdx: Replace deprecated strncpy() with strtomem_pad()
  selftests/x86/mm: Add new test that userspace stack is in fact NX
  x86/sev: Make boot_ghcb_page[] static
  x86/boot: Move x86_cache_alignment initialization to correct spot
  x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach
  x86/sev-es: Allow copy_from_kernel_nofault() in earlier boot
  x86_64: Show CR4.PSE on auxiliaries like on BSP
  x86/iommu/docs: Update AMD IOMMU specification document URL
  x86/sev/docs: Update document URL in amd-memory-encryption.rst
  x86/mm: Move arch_memory_failure() and arch_is_platform_page() definitions from <asm/processor.h> to <asm/pgtable.h>
  ACPI/NUMA: Apply SRAT proximity domain to entire CFMWS window
  x86/numa: Introduce numa_fill_memblks()
parents 1641b9b0 a1e2b8b3
...@@ -130,4 +130,4 @@ SNP feature support. ...@@ -130,4 +130,4 @@ SNP feature support.
More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR
[1] https://www.amd.com/system/files/TechDocs/40332.pdf [1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf
...@@ -5,7 +5,7 @@ x86 IOMMU Support ...@@ -5,7 +5,7 @@ x86 IOMMU Support
The architecture specs can be obtained from the below locations. The architecture specs can be obtained from the below locations.
- Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf - Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
- AMD: https://www.amd.com/system/files/TechDocs/48882_IOMMU.pdf - AMD: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_3_07_PUB.pdf
This guide gives a quick cheat sheet for some basic understanding. This guide gives a quick cheat sheet for some basic understanding.
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "error.h" #include "error.h"
#include "../msr.h" #include "../msr.h"
struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb; struct ghcb *boot_ghcb;
/* /*
......
...@@ -119,7 +119,7 @@ static void __noreturn tdx_panic(const char *msg) ...@@ -119,7 +119,7 @@ static void __noreturn tdx_panic(const char *msg)
} message; } message;
/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
strncpy(message.str, msg, 64); strtomem_pad(message.str, msg, '\0');
args.r8 = message.r8; args.r8 = message.r8;
args.r9 = message.r9; args.r9 = message.r9;
......
...@@ -19,8 +19,10 @@ ...@@ -19,8 +19,10 @@
#ifdef CONFIG_X86_MEM_ENCRYPT #ifdef CONFIG_X86_MEM_ENCRYPT
void __init mem_encrypt_init(void); void __init mem_encrypt_init(void);
void __init mem_encrypt_setup_arch(void);
#else #else
static inline void mem_encrypt_init(void) { } static inline void mem_encrypt_init(void) { }
static inline void __init mem_encrypt_setup_arch(void) { }
#endif #endif
#ifdef CONFIG_AMD_MEM_ENCRYPT #ifdef CONFIG_AMD_MEM_ENCRYPT
...@@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data); ...@@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data);
void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void); void __init sme_early_init(void);
void __init sev_setup_arch(void);
void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_encrypt_kernel(struct boot_params *bp);
void __init sme_enable(struct boot_params *bp); void __init sme_enable(struct boot_params *bp);
...@@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } ...@@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { }
static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { } static inline void __init sme_early_init(void) { }
static inline void __init sev_setup_arch(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { }
......
...@@ -12,13 +12,6 @@ ...@@ -12,13 +12,6 @@
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
/*
* Too small node sizes may confuse the VM badly. Usually they
* result from BIOS bugs. So dont recognize nodes as standalone
* NUMA entities that have less than this amount of RAM listed:
*/
#define NODE_MIN_SIZE (4*1024*1024)
extern int numa_off; extern int numa_off;
/* /*
......
...@@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud) ...@@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud)
} }
#endif #endif
#ifdef CONFIG_X86_SGX
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure
bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */ #endif /* _ASM_X86_PGTABLE_H */
...@@ -724,14 +724,6 @@ enum mds_mitigations { ...@@ -724,14 +724,6 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV, MDS_MITIGATION_VMWERV,
}; };
#ifdef CONFIG_X86_SGX
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure
bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif
extern bool gds_ucode_mitigated(void); extern bool gds_ucode_mitigated(void);
#endif /* _ASM_X86_PROCESSOR_H */ #endif /* _ASM_X86_PROCESSOR_H */
...@@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start); ...@@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start);
#define phys_to_target_node phys_to_target_node #define phys_to_target_node phys_to_target_node
extern int memory_add_physaddr_to_nid(u64 start); extern int memory_add_physaddr_to_nid(u64 start);
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
extern int numa_fill_memblks(u64 start, u64 end);
#define numa_fill_memblks numa_fill_memblks
#endif #endif
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
......
...@@ -1115,18 +1115,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c) ...@@ -1115,18 +1115,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{ {
u32 eax, ebx, ecx, edx; u32 eax, ebx, ecx, edx;
bool vp_bits_from_cpuid = true;
if (c->extended_cpuid_level >= 0x80000008) { if (!cpu_has(c, X86_FEATURE_CPUID) ||
(c->extended_cpuid_level < 0x80000008))
vp_bits_from_cpuid = false;
if (vp_bits_from_cpuid) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx); cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff; c->x86_phys_bits = eax & 0xff;
} } else {
#ifdef CONFIG_X86_32 if (IS_ENABLED(CONFIG_X86_64)) {
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) c->x86_clflush_size = 64;
c->x86_phys_bits = 36; c->x86_phys_bits = 36;
#endif c->x86_virt_bits = 48;
} else {
c->x86_clflush_size = 32;
c->x86_virt_bits = 32;
c->x86_phys_bits = 32;
if (cpu_has(c, X86_FEATURE_PAE) ||
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
}
c->x86_cache_bits = c->x86_phys_bits; c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
} }
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
...@@ -1580,17 +1596,6 @@ static void __init cpu_parse_early_param(void) ...@@ -1580,17 +1596,6 @@ static void __init cpu_parse_early_param(void)
*/ */
static void __init early_identify_cpu(struct cpuinfo_x86 *c) static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{ {
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
c->x86_phys_bits = 32;
c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof(c->x86_capability)); memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0; c->extended_cpuid_level = 0;
...@@ -1602,7 +1607,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) ...@@ -1602,7 +1607,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c); cpu_detect(c);
get_cpu_vendor(c); get_cpu_vendor(c);
get_cpu_cap(c); get_cpu_cap(c);
get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID); setup_force_cpu_cap(X86_FEATURE_CPUID);
cpu_parse_early_param(); cpu_parse_early_param();
...@@ -1618,6 +1622,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) ...@@ -1618,6 +1622,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_CPUID); setup_clear_cpu_cap(X86_FEATURE_CPUID);
} }
get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_ALWAYS); setup_force_cpu_cap(X86_FEATURE_ALWAYS);
cpu_set_bug_bits(c); cpu_set_bug_bits(c);
......
...@@ -179,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ...@@ -179,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl $0, %ecx movl $0, %ecx
#endif #endif
/* Enable PAE mode, PGE and LA57 */ /* Enable PAE mode, PSE, PGE and LA57 */
orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL #ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip) testl $1, __pgtable_l5_enabled(%rip)
jz 1f jz 1f
......
...@@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p) ...@@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p)
* Needs to run after memblock setup because it needs the physical * Needs to run after memblock setup because it needs the physical
* memory size. * memory size.
*/ */
sev_setup_arch(); mem_encrypt_setup_arch();
efi_fake_memmap(); efi_fake_memmap();
efi_find_mirror(); efi_find_mirror();
......
...@@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) ...@@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
unsigned long vaddr = (unsigned long)unsafe_src; unsigned long vaddr = (unsigned long)unsafe_src;
/* /*
* Range covering the highest possible canonical userspace address * Do not allow userspace addresses. This disallows
* as well as non-canonical address range. For the canonical range * normal userspace and the userspace guard page:
* we also need to include the userspace guard page.
*/ */
return vaddr >= TASK_SIZE_MAX + PAGE_SIZE && if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
__is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); return false;
/*
* Allow everything during early boot before 'x86_virt_bits'
* is initialized. Needed for instruction decoding in early
* exception handlers.
*/
if (!boot_cpu_data.x86_virt_bits)
return true;
return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
} }
#else #else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <linux/swiotlb.h> #include <linux/swiotlb.h>
#include <linux/cc_platform.h> #include <linux/cc_platform.h>
#include <linux/mem_encrypt.h> #include <linux/mem_encrypt.h>
#include <linux/virtio_anchor.h>
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev) bool force_dma_unencrypted(struct device *dev)
...@@ -86,3 +87,36 @@ void __init mem_encrypt_init(void) ...@@ -86,3 +87,36 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info(); print_mem_encrypt_feature_info();
} }
/*
 * Arch hook called from setup_arch() after memblock is initialized (it
 * needs the total physical memory size): for memory-encrypted guests,
 * grow the SWIOTLB bounce-buffer pool and restrict virtio to shared
 * memory access.
 */
void __init mem_encrypt_setup_arch(void)
{
	phys_addr_t total_mem = memblock_phys_mem_size();
	unsigned long size;

	/* Nothing to do unless this is a guest with memory encryption (SEV, TDX). */
	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
		return;

	/*
	 * For SEV and TDX, all DMA has to occur via shared/unencrypted pages.
	 * Kernel uses SWIOTLB to make this happen without changing device
	 * drivers. However, depending on the workload being run, the
	 * default 64MB of SWIOTLB may not be enough and SWIOTLB may
	 * run out of buffers for DMA, resulting in I/O errors and/or
	 * performance degradation especially with high I/O workloads.
	 *
	 * Adjust the default size of SWIOTLB using a percentage of guest
	 * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer
	 * memory is allocated from low memory, ensure that the adjusted size
	 * is within the limits of low available memory.
	 *
	 * The percentage of guest memory used here for SWIOTLB buffers
	 * is more of an approximation of the static adjustment which
	 * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
	 */
	size = total_mem * 6 / 100;
	size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
	swiotlb_adjust_size(size);

	/* Set restricted memory access for virtio. */
	virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
}
...@@ -19,8 +19,6 @@ ...@@ -19,8 +19,6 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
#include <linux/virtio_anchor.h>
#include <linux/cc_platform.h> #include <linux/cc_platform.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
...@@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data) ...@@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data)
__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
} }
void __init sev_setup_arch(void)
{
phys_addr_t total_mem = memblock_phys_mem_size();
unsigned long size;
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
/*
* For SEV, all DMA has to occur via shared/unencrypted pages.
* SEV uses SWIOTLB to make this happen without changing device
* drivers. However, depending on the workload being run, the
* default 64MB of SWIOTLB may not be enough and SWIOTLB may
* run out of buffers for DMA, resulting in I/O errors and/or
* performance degradation especially with high I/O workloads.
*
* Adjust the default size of SWIOTLB for SEV guests using
* a percentage of guest memory for SWIOTLB buffers.
* Also, as the SWIOTLB bounce buffer memory is allocated
* from low memory, ensure that the adjusted size is within
* the limits of low available memory.
*
* The percentage of guest memory used here for SWIOTLB buffers
* is more of an approximation of the static adjustment which
* 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
*/
size = total_mem * 6 / 100;
size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
swiotlb_adjust_size(size);
/* Set restricted memory access for virtio. */
virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
}
static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
{ {
unsigned long pfn = 0; unsigned long pfn = 0;
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <linux/nodemask.h> #include <linux/nodemask.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/topology.h> #include <linux/topology.h>
#include <linux/sort.h>
#include <asm/e820/api.h> #include <asm/e820/api.h>
#include <asm/proto.h> #include <asm/proto.h>
...@@ -602,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) ...@@ -602,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
if (start >= end) if (start >= end)
continue; continue;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
if (end && (end - start) < NODE_MIN_SIZE)
continue;
alloc_node_data(nid); alloc_node_data(nid);
} }
...@@ -964,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start) ...@@ -964,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start)
return nid; return nid;
} }
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
/*
 * sort() comparator ordering struct numa_memblk pointers by start address.
 *
 * The start fields are u64: compare them explicitly instead of returning
 * their difference, which would be truncated to 'int' and can produce the
 * wrong sign (and thus a wrong sort order) for memblks whose start
 * addresses differ by 2^31 bytes or more.
 */
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	/* Classic overflow-safe three-way comparison: -1, 0, or 1. */
	return (ma->start > mb->start) - (ma->start < mb->start);
}
/* Init-time scratch array of memblk pointers used only by numa_fill_memblks(). */
static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the @start-@end
 * physical address range, such that the first memblk includes
 * @start, the last memblk includes @end, and any gaps in between
 * are filled.
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. Exclude (start == bi->end) since
	 * end addresses in both a CFMWS range and a memblk range
	 * are exclusive.
	 *
	 * This list of pointers is used to make in-place changes
	 * that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* Half-open overlap test; both ranges use exclusive end addresses. */
		if (start < bi->end && end >= bi->start) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblks
	 * end address and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			/* Overlapping or adjacent: just advance past this memblk. */
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			/* Gap: pull this memblk's start back to close it. */
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}
#endif #endif
...@@ -310,11 +310,16 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, ...@@ -310,11 +310,16 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
start = cfmws->base_hpa; start = cfmws->base_hpa;
end = cfmws->base_hpa + cfmws->window_size; end = cfmws->base_hpa + cfmws->window_size;
/* Skip if the SRAT already described the NUMA details for this HPA */ /*
node = phys_to_target_node(start); * The SRAT may have already described NUMA details for all,
if (node != NUMA_NO_NODE) * or a portion of, this CFMWS HPA range. Extend the memblks
* found for any portion of the window to cover the entire
* window.
*/
if (!numa_fill_memblks(start, end))
return 0; return 0;
/* No SRAT description. Create a new node. */
node = acpi_map_pxm_to_node(*fake_pxm); node = acpi_map_pxm_to_node(*fake_pxm);
if (node == NUMA_NO_NODE) { if (node == NUMA_NO_NODE) {
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#define MAX_NUMNODES (1 << NODES_SHIFT) #define MAX_NUMNODES (1 << NODES_SHIFT)
#define NUMA_NO_NODE (-1) #define NUMA_NO_NODE (-1)
#define NUMA_NO_MEMBLK (-1)
/* optionally keep NUMA memory info available post init */ /* optionally keep NUMA memory info available post init */
#ifdef CONFIG_NUMA_KEEP_MEMINFO #ifdef CONFIG_NUMA_KEEP_MEMINFO
...@@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start) ...@@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start)
return 0; return 0;
} }
#endif #endif
#ifndef numa_fill_memblks
static inline int __init numa_fill_memblks(u64 start, u64 end)
{
return NUMA_NO_MEMBLK;
}
#endif
#else /* !CONFIG_NUMA */ #else /* !CONFIG_NUMA */
static inline int numa_nearest_node(int node, unsigned int state) static inline int numa_nearest_node(int node, unsigned int state)
{ {
......
...@@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap ...@@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
check_initial_reg_state sigreturn iopl ioperm \ check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \ test_vsyscall mov_ss_trap \
syscall_arg_fault fsgsbase_restore sigaltstack syscall_arg_fault fsgsbase_restore sigaltstack
TARGETS_C_BOTHBITS += nx_stack
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \ test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer vdso_restorer
...@@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S ...@@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
# state. # state.
$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack
$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack
...@@ -573,7 +573,7 @@ int do_uring(unsigned long lam) ...@@ -573,7 +573,7 @@ int do_uring(unsigned long lam)
char path[PATH_MAX] = {0}; char path[PATH_MAX] = {0};
/* get current process path */ /* get current process path */
if (readlink("/proc/self/exe", path, PATH_MAX) <= 0) if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
return 1; return 1;
int file_fd = open(path, O_RDONLY); int file_fd = open(path, O_RDONLY);
...@@ -680,14 +680,14 @@ static int handle_execve(struct testcases *test) ...@@ -680,14 +680,14 @@ static int handle_execve(struct testcases *test)
perror("Fork failed."); perror("Fork failed.");
ret = 1; ret = 1;
} else if (pid == 0) { } else if (pid == 0) {
char path[PATH_MAX]; char path[PATH_MAX] = {0};
/* Set LAM mode in parent process */ /* Set LAM mode in parent process */
if (set_lam(lam) != 0) if (set_lam(lam) != 0)
return 1; return 1;
/* Get current binary's path and the binary was run by execve */ /* Get current binary's path and the binary was run by execve */
if (readlink("/proc/self/exe", path, PATH_MAX) <= 0) if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
exit(-1); exit(-1);
/* run binary to get LAM mode and return to parent process */ /* run binary to get LAM mode and return to parent process */
......
/*
* Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Test that userspace stack is NX. Requires linking with -Wl,-z,noexecstack
* because I don't want to bother with PT_GNU_STACK detection.
*
* Fill the stack with INT3's and then try to execute some of them:
* SIGSEGV -- good, SIGTRAP -- bad.
*
* Regular stack is completely overwritten before testing.
* Test doesn't exit SIGSEGV handler after first fault at INT3.
*/
#undef _GNU_SOURCE
#define _GNU_SOURCE
#undef NDEBUG
#include <assert.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>
#define PAGE_SIZE 4096
/*
 * This is memset(rsp, 0xcc, -1); but down.
 * It will SIGSEGV when bottom of the stack is reached.
 * Byte-size access is important! (see rdi tweak in the signal handler).
 */
void make_stack1(void);
asm(
".pushsection .text\n"
".globl make_stack1\n"
".align 16\n"
"make_stack1:\n"
/* AL = 0xcc (INT3) fill byte, DI = current stack pointer, CX = "infinite" count */
"mov $0xcc, %al\n"
#if defined __amd64__
"mov %rsp, %rdi\n"
"mov $-1, %rcx\n"
#elif defined __i386__
"mov %esp, %edi\n"
"mov $-1, %ecx\n"
#else
#error
#endif
/* DF=1: stosb decrements DI, so the fill walks DOWN until it faults. */
"std\n"
"rep stosb\n"
/* unreachable */
"hlt\n"
".type make_stack1,@function\n"
".size make_stack1,.-make_stack1\n"
".popsection\n"
);
/*
 * memset(p, 0xcc, -1);
 * It will SIGSEGV when top of the stack is reached.
 *
 * NOTE(review): the asm never loads the C argument 'p'; this code is only
 * entered from the SIGSEGV handler, which rewrites RIP to point here after
 * leaving the start address in gregs[RDI], so DI is already set on entry.
 */
void make_stack2(uint64_t p);
asm(
".pushsection .text\n"
".globl make_stack2\n"
".align 16\n"
"make_stack2:\n"
/* AL = 0xcc (INT3) fill byte, CX = "infinite" count; DI inherited (see above) */
"mov $0xcc, %al\n"
#if defined __amd64__
"mov $-1, %rcx\n"
#elif defined __i386__
"mov $-1, %ecx\n"
#else
#error
#endif
/* DF=0: stosb increments DI, filling UP until it faults past the stack top. */
"cld\n"
"rep stosb\n"
/* unreachable */
"hlt\n"
".type make_stack2,@function\n"
".size make_stack2,.-make_stack2\n"
".popsection\n"
);
/*
 * State machine index advanced by the SIGSEGV handler:
 * 0 = filling down, 1 = filling up, 2 = probing pages for NX, 3 = done.
 * volatile: mutated from a signal handler.
 */
static volatile int test_state = 0;
/* Lowest stack address reached by make_stack1(), recorded in the handler. */
static volatile unsigned long stack_min_addr;
/* Map arch-specific ucontext register indices/names to common macros. */
#if defined __amd64__
#define RDI REG_RDI
#define RIP REG_RIP
#define RIP_STRING "rip"
#elif defined __i386__
#define RDI REG_EDI
#define RIP REG_EIP
#define RIP_STRING "eip"
#else
#error
#endif
/*
 * SIGSEGV drives the whole test, one fault per state:
 *   0: make_stack1() hit the bottom of the stack; record it and restart
 *      execution at make_stack2() to fill upward.
 *   1: make_stack2() ran off the top of the stack; begin "executing"
 *      stack pages from the top down.
 *   2: a stack page faulted on execution (NX held); probe the next page.
 *   3: the lowest page faulted too -- every stack page was NX, PASS.
 */
static void sigsegv(int sig, siginfo_t *info, void *ctx)
{
	ucontext_t *uc = ctx;
	unsigned long stack_max_addr;

	/*
	 * Some Linux versions didn't clear DF before entering signal
	 * handler. make_stack1() doesn't have a chance to clear DF
	 * either so we clear it by hand here.
	 */
	asm volatile ("cld" ::: "memory");

	switch (test_state) {
	case 0:
		/* Stack is faulted and cleared from RSP to the lowest address. */
		stack_min_addr = ++uc->uc_mcontext.gregs[RDI];
		printf("stack min %lx\n", stack_min_addr);
		uc->uc_mcontext.gregs[RIP] = (uintptr_t)&make_stack2;
		test_state = 1;
		break;
	case 1:
		/* Stack has been cleared from top to bottom. */
		stack_max_addr = uc->uc_mcontext.gregs[RDI];
		printf("stack max %lx\n", stack_max_addr);
		/* Start faulting pages on stack and see what happens. */
		uc->uc_mcontext.gregs[RIP] = stack_max_addr - PAGE_SIZE;
		test_state = 2;
		break;
	case 2:
		/* Stack page is NX -- good, test next page. */
		uc->uc_mcontext.gregs[RIP] -= PAGE_SIZE;
		if (uc->uc_mcontext.gregs[RIP] == stack_min_addr) {
			/* One more SIGSEGV and test ends. */
			test_state = 3;
		}
		break;
	default:
		printf("PASS\tAll stack pages are NX\n");
		_exit(EXIT_SUCCESS);
	}
}
/*
 * An INT3 on the stack actually executed and raised SIGTRAP instead of
 * SIGSEGV: the page was mapped executable, so the stack is not NX -- FAIL.
 */
static void sigtrap(int sig, siginfo_t *info, void *ctx)
{
	const ucontext_t *uc = ctx;

	printf("FAIL\texecutable page on the stack: " RIP_STRING " %lx\n",
	       (unsigned long)uc->uc_mcontext.gregs[RIP]);
	_exit(EXIT_FAILURE);
}
int main(void)
{
{
struct sigaction act = {};
sigemptyset(&act.sa_mask);
act.sa_flags = SA_SIGINFO;
act.sa_sigaction = &sigsegv;
int rv = sigaction(SIGSEGV, &act, NULL);
assert(rv == 0);
}
{
struct sigaction act = {};
sigemptyset(&act.sa_mask);
act.sa_flags = SA_SIGINFO;
act.sa_sigaction = &sigtrap;
int rv = sigaction(SIGTRAP, &act, NULL);
assert(rv == 0);
}
{
struct rlimit rlim;
int rv = getrlimit(RLIMIT_STACK, &rlim);
assert(rv == 0);
/* Cap stack at time-honored 8 MiB value. */
rlim.rlim_max = rlim.rlim_cur;
if (rlim.rlim_max > 8 * 1024 * 1024) {
rlim.rlim_max = 8 * 1024 * 1024;
}
rv = setrlimit(RLIMIT_STACK, &rlim);
assert(rv == 0);
}
{
/*
* We don't know now much stack SIGSEGV handler uses.
* Bump this by 1 page every time someone complains,
* or rewrite it in assembly.
*/
const size_t len = SIGSTKSZ;
void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
assert(p != MAP_FAILED);
stack_t ss = {};
ss.ss_sp = p;
ss.ss_size = len;
int rv = sigaltstack(&ss, NULL);
assert(rv == 0);
}
make_stack1();
/*
* Unreachable, but if _this_ INT3 is ever reached, it's a bug somewhere.
* Fold it into main SIGTRAP pathway.
*/
__builtin_trap();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment