Commit d950b958 authored by Naoya Horiguchi, committed by Andi Kleen

HWPOISON, hugetlb: soft offlining for hugepage

This patch extends the soft offlining framework to support hugepages.
When corrected memory errors occur repeatedly on a hugepage, we can
choose to stop using it by migrating its data onto another hugepage
and disabling the original (possibly half-broken) one.
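For illustration only (not part of this patch): once this support is in place,
the soft-offline path can be exercised from userspace, for example with
madvise(MADV_SOFT_OFFLINE) on a hugetlb mapping. The sketch below is a minimal
assumption-laden example: it presumes CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN,
a reserved 2MB hugepage (vm.nr_hugepages > 0), and x86 values for the
fallback defines.

    /*
     * Illustrative userspace sketch, not part of this patch: exercise the
     * soft-offline path on a hugepage via madvise(MADV_SOFT_OFFLINE).
     * Assumptions: CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, one reserved
     * 2MB hugepage, x86 constants for the fallback defines below.
     */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB		0x40000		/* x86 value */
    #endif
    #ifndef MADV_SOFT_OFFLINE
    #define MADV_SOFT_OFFLINE	101		/* asm-generic value */
    #endif

    int main(void)
    {
    	size_t len = 2UL << 20;			/* one 2MB hugepage */
    	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
    		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

    	if (p == MAP_FAILED) {
    		perror("mmap(MAP_HUGETLB)");
    		return 1;
    	}
    	p[0] = 1;	/* fault the hugepage in so there is data to migrate */

    	/* Ask the kernel to migrate the contents and retire the hugepage. */
    	if (madvise(p, len, MADV_SOFT_OFFLINE))
    		perror("madvise(MADV_SOFT_OFFLINE)");

    	munmap(p, len);
    	return 0;
    }

Alternatively, a physical address can be written to
/sys/devices/system/memory/soft_offline_page to reach the same code path.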

ChangeLog since v4:
- branch soft_offline_page() for hugepage

ChangeLog since v3:
- remove comment about "ToDo: hugepage soft-offline"

ChangeLog since v2:
- move refcount handling into isolate_lru_page()

ChangeLog since v1:
- add double check in isolating hwpoisoned hugepage
- define free/non-free checker for hugepage
- postpone calling put_page() for hugepage in soft_offline_page()
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
parent 8c6c2ecb
@@ -693,8 +693,6 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * Issues:
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  *   To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
@@ -1220,6 +1218,10 @@ EXPORT_SYMBOL(unpoison_memory);
 static struct page *new_page(struct page *p, unsigned long private, int **x)
 {
 	int nid = page_to_nid(p);
+	if (PageHuge(p))
+		return alloc_huge_page_node(page_hstate(compound_head(p)),
+						   nid);
+	else
 	return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
@@ -1248,8 +1250,15 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	 * was free.
 	 */
 	set_migratetype_isolate(p);
+	/*
+	 * When the target page is a free hugepage, just remove it
+	 * from free hugepage list.
+	 */
 	if (!get_page_unless_zero(compound_head(p))) {
-		if (is_free_buddy_page(p)) {
+		if (PageHuge(p)) {
+			pr_debug("get_any_page: %#lx free huge page\n", pfn);
+			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+		} else if (is_free_buddy_page(p)) {
 			pr_debug("get_any_page: %#lx free buddy page\n", pfn);
 			/* Set hwpoison bit while page is still isolated */
 			SetPageHWPoison(p);
@@ -1268,6 +1277,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	return ret;
 }
 
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_head(page);
+	LIST_HEAD(pagelist);
+
+	ret = get_any_page(page, pfn, flags);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		goto done;
+
+	if (PageHWPoison(hpage)) {
+		put_page(hpage);
+		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+		return -EBUSY;
+	}
+
+	/* Keep page count to indicate a given hugepage is isolated. */
+	list_add(&hpage->lru, &pagelist);
+	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+	if (ret) {
+		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+			pfn, ret, page->flags);
+		if (ret > 0)
+			ret = -EIO;
+		return ret;
+	}
+done:
+	if (!PageHWPoison(hpage))
+		atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+	set_page_hwpoison_huge_page(hpage);
+	dequeue_hwpoisoned_huge_page(hpage);
+	/* keep elevated page count for bad page */
+	return ret;
+}
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page:	page to offline
@@ -1295,6 +1343,9 @@ int soft_offline_page(struct page *page, int flags)
 	int ret;
 	unsigned long pfn = page_to_pfn(page);
 
+	if (PageHuge(page))
+		return soft_offline_huge_page(page, flags);
+
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
 		return ret;
...