/* Wendelin.bigfile | Virtual memory
 * Copyright (C) 2014-2019  Nexedi SA and Contributors.
 *                          Kirill Smelkov <kirr@nexedi.com>
 *
 * This program is free software: you can Use, Study, Modify and Redistribute
 * it under the terms of the GNU General Public License version 3, or (at your
 * option) any later version, as published by the Free Software Foundation.
 *
 * You can also Link and Combine this program with other software covered by
 * the terms of any of the Free Software licenses or any of the Open Source
 * Initiative approved licenses and Convey the resulting work. Corresponding
 * source of such a combination shall include the source code for all other
 * software used.
 *
 * This program is distributed WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See COPYING file for full licensing terms.
 * See https://www.nexedi.com/licensing for rationale and options.
 *
 *
 * TODO description
 */

#include <wendelin/bigfile/virtmem.h>
#include <wendelin/bigfile/file.h>
#include <wendelin/bigfile/pagemap.h>
#include <wendelin/bigfile/ram.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>

#include <ccan/minmax/minmax.h>
#include <ccan/bitmap/bitmap.h>

#include <sys/mman.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static size_t   page_size(const Page *page);
static void     page_drop_memory(Page *page);
static void     page_del(Page *page);
static void    *vma_page_addr(VMA *vma, Page *page);
static pgoff_t  vma_addr_fpgoffset(VMA *vma, uintptr_t addr);
static void     vma_mmap_page(VMA *vma, Page *page);
static int      vma_page_ismapped(VMA *vma, Page *page);
static void     vma_page_ensure_unmapped(VMA *vma, Page *page);
static void     vma_page_ensure_notmappedrw(VMA *vma, Page *page);
static int      __ram_reclaim(RAM *ram);

#define VIRT_DEBUG   0
#if VIRT_DEBUG
# define TRACE(msg, ...) do { fprintf(stderr, msg, ## __VA_ARGS__); } while (0)
#else
# define TRACE(msg, ...) do {} while(0)
#endif


/* global lock which protects manipulating virtmem data structures
 *
 * NOTE not scalable, but this is a temporary solution - we are going to move
 * memory management back into the kernel, where it is done properly. */
static pthread_mutex_t virtmem_lock = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP;
static const VirtGilHooks *virtmem_gilhooks;

void *virt_gil_ensure_unlocked(void)
{
    void *gilstate = NULL;

    if (virtmem_gilhooks)
        gilstate = virtmem_gilhooks->gil_ensure_unlocked();

    return gilstate;
}


void virt_gil_retake_if_waslocked(void *gilstate)
{
    if (gilstate)
        virtmem_gilhooks->gil_retake_if_waslocked(gilstate);
}

void virt_lock()
{
    void *gilstate = NULL;

    /* make sure we don't hold e.g. python GIL (not to deadlock, as GIL oscillates) */
    gilstate = virt_gil_ensure_unlocked();

    /* acquire virtmem lock */
    xpthread_mutex_lock(&virtmem_lock);

    /* retake GIL if we were holding it originally */
    virt_gil_retake_if_waslocked(gilstate);
}

void virt_unlock()
{
    xpthread_mutex_unlock(&virtmem_lock);
}


void virt_lock_hookgil(const VirtGilHooks *gilhooks)
{
    // FIXME we hit the assert below for real, because `import bigfile,
    // wendelin.bigfile` imports bigfile/__init__.py twice, which in turn
    // imports bigfile/_bigfile.so twice. However Python loads the _bigfile.so
    // DSO only once - oops. The bug happens in practice when running tests via
    // pytest under python3.
    //
    // XXX temp hack to workaround that bug for now.
    if (virtmem_gilhooks == gilhooks)
        virtmem_gilhooks = NULL;

    BUG_ON(virtmem_gilhooks);       /* prevent registering multiple times */
    virtmem_gilhooks = gilhooks;
}


/* block/restore SIGSEGV for current thread - code outside of the pagefault
 * handler should not access any non-mmapped memory -> so on any such pagefault
 * we should just die with a coredump, not try to incorrectly handle it.
 *
 * NOTE sigmask is per-thread. When blocking there is no race wrt other threads
 * correctly accessing data via pagefaulting.   */
static void sigsegv_block(sigset_t *save_sigset)
{
    sigset_t mask_segv;
    xsigemptyset(&mask_segv);
    xsigaddset(&mask_segv, SIGSEGV);
    xpthread_sigmask(SIG_BLOCK, &mask_segv, save_sigset);
}

static void sigsegv_restore(const sigset_t *save_sigset)
{
    int how = xsigismember(save_sigset, SIGSEGV) ? SIG_BLOCK : SIG_UNBLOCK;
    sigset_t mask_segv;

    xsigemptyset(&mask_segv);
    xsigaddset(&mask_segv, SIGSEGV);
    xpthread_sigmask(how, &mask_segv, NULL);
}
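
/* Virtmem entry points in this file follow a common pattern (illustrative
 * sketch, mirroring e.g. fileh_open below):
 *
 *      sigsegv_block(&save_sigset);    // no pagefault handling while we mutate virtmem state
 *      virt_lock();
 *      ... manipulate virtmem data structures ...
 *      virt_unlock();
 *      sigsegv_restore(&save_sigset);
 */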


/****************
 * OPEN / CLOSE *
 ****************/

int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram, FileHOpenFlags flags)
{
    int err = 0;
    sigset_t save_sigset;
    const bigfile_ops *fops = file->file_ops;

    if (!(flags == 0 || flags == MMAP_OVERLAY || flags == DONT_MMAP_OVERLAY))
        return -EINVAL;
    if (flags == 0) {
        flags = fops->mmap_setup_read ? MMAP_OVERLAY : DONT_MMAP_OVERLAY;
    }
    if (flags & MMAP_OVERLAY && flags & DONT_MMAP_OVERLAY)
        return -EINVAL;
    if (flags == MMAP_OVERLAY) {
        ASSERT(fops->mmap_setup_read);
        ASSERT(fops->remmap_blk_read);
    }
    if (flags == DONT_MMAP_OVERLAY) {
        ASSERT(fops->loadblk);
    }

    sigsegv_block(&save_sigset);
    virt_lock();

    bzero(fileh, sizeof(*fileh));
    fileh->ramh = ramh_open(ram);
    if (!fileh->ramh) {
        err = -1;
        goto out;
    }

    fileh->file = file;
    INIT_LIST_HEAD(&fileh->mmaps);
    INIT_LIST_HEAD(&fileh->dirty_pages);
    fileh->writeout_inprogress = 0;
    pagemap_init(&fileh->pagemap, ilog2_exact(ram->pagesize));

    fileh->mmap_overlay = (flags == MMAP_OVERLAY);

out:
    virt_unlock();
    sigsegv_restore(&save_sigset);
    return err;
}
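
/* Illustrative fileh lifecycle, assuming a BigFile `file` and RAM `ram` were
 * already set up by the caller (error handling omitted):
 *
 *      BigFileH fileh;
 *      VMA      vma;
 *
 *      fileh_open(&fileh, file, ram, 0);       // flags=0 auto-selects MMAP_OVERLAY vs DONT_MMAP_OVERLAY
 *      fileh_mmap(&vma, &fileh, 0, 4);         // map file pages [0, 4) into memory
 *      ... read/write memory in [vma.addr_start, vma.addr_stop) ...
 *      fileh_dirty_writeout(&fileh, WRITEOUT_STORE | WRITEOUT_MARKSTORED);
 *      vma_unmap(&vma);
 *      fileh_close(&fileh);
 */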


void fileh_close(BigFileH *fileh)
{
    Page *page;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* it's an error to close fileh with existing mappings */
    // XXX implement the same semantics usual files have wrt mmaps - if we release
    // fileh, but mapping exists - real fileh release is delayed to last unmap ?
    BUG_ON(!list_empty(&fileh->mmaps));

    /* it's an error to close fileh while writeout is in progress */
    BUG_ON(fileh->writeout_inprogress);

    /* drop all pages (dirty or not) associated with this fileh */
    pagemap_for_each(page, &fileh->pagemap) {
        /* it's an error to close a fileh whose mapping is currently being
         * accessed in another thread */
        BUG_ON(page->state == PAGE_LOADING);
        page_drop_memory(page);
        page_del(page);
//        list_del(&page->lru);
//        bzero(page, sizeof(*page)); /* just in case */
//        free(page);
    }

    BUG_ON(!list_empty(&fileh->dirty_pages));

    /* and clear pagemap */
    pagemap_clear(&fileh->pagemap);

    if (fileh->ramh)
        ramh_close(fileh->ramh);

    bzero(fileh, sizeof(*fileh));
    virt_unlock();
    sigsegv_restore(&save_sigset);
}



/****************
 * MMAP / UNMAP *
 ****************/

int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen)
{
    void *addr;
    size_t len = pglen * fileh->ramh->ram->pagesize;
    BigFile *file = fileh->file;
    const bigfile_ops *fops = file->file_ops;
    int err = 0;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* start preparing vma */
    bzero(vma, sizeof(*vma));
    vma->fileh       = fileh;
    vma->f_pgoffset  = pgoffset;

    /* alloc vma->page_ismappedv[] */
    vma->page_ismappedv = bitmap_alloc0(pglen);
    if (!vma->page_ismappedv)
        goto fail;

    if (fileh->mmap_overlay) {
        /* wcfs: mmap(base, READ) */
        TODO (file->blksize != fileh->ramh->ram->pagesize);
        addr = fops->mmap_setup_read(file, pgoffset, pglen, vma);
    } else {
        /* !wcfs: allocate address space somewhere */
        addr = mem_valloc(NULL, len);
    }
    if (!addr)
        goto fail;

    /* vma address range known */
    vma->addr_start  = (uintptr_t)addr;
    vma->addr_stop   = vma->addr_start + len;

    /* wcfs: mmap(fileh->dirty_pages) over base */
    if (fileh->mmap_overlay) {
        Page* page;
        struct list_head *hpage;

        list_for_each(hpage, &fileh->dirty_pages) {
            page = list_entry(hpage, typeof(*page), in_dirty);
            BUG_ON(page->state != PAGE_DIRTY);

            if (!(pgoffset <= page->f_pgoffset && page->f_pgoffset < pgoffset + pglen))
                continue; /* page is out of requested mmap coverage */

            // XXX notify watcher that we mmap RAM page in its range? -> no need
            vma_mmap_page(vma, page);
        }
    }

    /* everything allocated - link it up */

    // XXX need to init vma->virt_list first?
    /* hook vma to fileh->mmaps */
    list_add_tail(&vma->same_fileh, &fileh->mmaps);

    /* register vma for pagefault handling */
    virt_register_vma(vma);

out:
    virt_unlock();
    sigsegv_restore(&save_sigset);
    return err;

fail:
    free(vma->page_ismappedv);
    bzero(vma, sizeof(*vma));
    err = -1;
    goto out;
}


void vma_unmap(VMA *vma)
{
    BigFileH *fileh = vma->fileh;
    size_t   len    = vma->addr_stop - vma->addr_start;
    size_t   pglen  = len / fileh->ramh->ram->pagesize;
    int i;
    pgoff_t  pgoffset;
    Page *page;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* unregister from vmamap - so that pagefault handler does not recognize
     * this area as valid */
    virt_unregister_vma(vma);

    /* unlink from fileh.mmaps   XXX place ok ? */
    list_del_init(&vma->same_fileh);

    /* unmap whole vma at once - the kernel unmaps each mapping in turn.
     * NOTE error here would mean something is broken */
    if (fileh->mmap_overlay) {
        fileh->file->file_ops->munmap(fileh->file, vma);
    } else {
        xmunmap((void *)vma->addr_start, len);
    }

    /* scan through mapped-to-this-vma pages and release them */
    for (i=0; i < pglen; ++i) {
        if (!bitmap_test_bit(vma->page_ismappedv, i))
            continue;

        pgoffset = vma->f_pgoffset + i;
        page = pagemap_get(&fileh->pagemap, pgoffset);
        BUG_ON(!page);
        page_decref(page);
    }

    /* free memory and be done */
    free(vma->page_ismappedv);

    bzero(vma, sizeof(*vma));
    virt_unlock();
    sigsegv_restore(&save_sigset);
}


/**********************
 * WRITEOUT / DISCARD *
 **********************/

/* helper for sorting dirty pages by ->f_pgoffset */
static int hpage_indirty_cmp_bypgoffset(struct list_head *hpage1, struct list_head *hpage2, void *_)
{
    Page *page1 = list_entry(hpage1, typeof(*page1), in_dirty);
    Page *page2 = list_entry(hpage2, typeof(*page2), in_dirty);

    if (page1->f_pgoffset < page2->f_pgoffset)
        return -1;
    if (page1->f_pgoffset > page2->f_pgoffset)
        return +1;
    return 0;
}

int fileh_dirty_writeout(BigFileH *fileh, enum WriteoutFlags flags)
{
    Page *page;
    BigFile *file = fileh->file;
    struct list_head *hpage, *hpage_next, *hmmap;
    sigset_t save_sigset;
    int err = 0;

    /* check flags */
    if (!(flags &  (WRITEOUT_STORE | WRITEOUT_MARKSTORED))   ||
          flags & ~(WRITEOUT_STORE | WRITEOUT_MARKSTORED))
        return -EINVAL;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* concurrent writeouts are not allowed */
    BUG_ON(fileh->writeout_inprogress);
    fileh->writeout_inprogress = 1;

    /* pages are stored (if stored) in sorted order */
    if (flags & WRITEOUT_STORE)
        list_sort(&fileh->dirty_pages, hpage_indirty_cmp_bypgoffset, NULL);

    /* write out dirty pages */
    list_for_each_safe(hpage, hpage_next, &fileh->dirty_pages) {
        page = list_entry(hpage, typeof(*page), in_dirty);
        BUG_ON(page->state != PAGE_DIRTY);

        /* ->storeblk() */
        if (flags & WRITEOUT_STORE) {
            TODO (file->blksize != page_size(page));
            blk_t blk = page->f_pgoffset;   // NOTE assumes blksize = pagesize

            void *pagebuf;

            /* mmap page temporarily somewhere
             *
             * ( we cannot directly use the page's existing mapping in some vma,
             *   because while storeblk is called with the virtmem lock released
             *   that mapping can go away ) */
            pagebuf = page_mmap(page, NULL, PROT_READ);
            TODO(!pagebuf); // XXX err

            /* unlock virtmem before calling storeblk()
             *
             * that call is potentially slow and external code can take other
             * locks. If that "other locks" are also taken before external code
             * calls e.g. fileh_invalidate_page() in different codepath a deadlock
             * can happen. (similar to loadblk case) */
            virt_unlock();

            err = file->file_ops->storeblk(file, blk, pagebuf);

            /* relock virtmem */
            virt_lock();

            xmunmap(pagebuf, page_size(page));

            if (err)
                goto out;
        }

        /* wcfs:  remmap RW pages to base layer
         * !wcfs: page.state -> PAGE_LOADED and correct mappings RW -> R
         *
         * NOTE for transactional storage (ZODB and ZBigFile) storeblk creates a
         * new transaction on the database side, but does not update the current
         * DB connection to view that transaction. Thus if loadblk is called
         * with a not-yet-resynced DB connection, it will return old - not yet
         * stored - data. For the !wcfs case this is partly mitigated by the
         * fact that stored pages are kept as PAGE_LOADED in ram, but that
         * cannot be relied upon, as ram_reclaim can drop those pages and a read
         * access to them will trigger loadblk from the database, which will
         * return old data. For the wcfs case remapping to the base layer will
         * always return old data until the wcfs mapping is updated to view the
         * database at a newer state.
         *
         * In general it is a bug to access data pages in between transactions,
         * so we accept this corner-case difference between wcfs and !wcfs.
         */
        if (flags & WRITEOUT_MARKSTORED) {
            page->state = PAGE_LOADED;
            list_del_init(&page->in_dirty);

            list_for_each(hmmap, &fileh->mmaps) {
                VMA *vma = list_entry(hmmap, typeof(*vma), same_fileh);
                if (fileh->mmap_overlay) {
                    /* wcfs:  RW -> base layer */
                    vma_page_ensure_unmapped(vma, page);
                } else {
                    /* !wcfs: RW -> R */
                    vma_page_ensure_notmappedrw(vma, page);
                }
            }

            /* wcfs:  all vmas are using base layer now - drop page completely
             *        without unnecessarily growing RSS and relying on reclaim.
             * !wcfs: keep the page in RAM cache, even if it is not mapped anywhere */
            if (fileh->mmap_overlay) {
                ASSERT(page->refcnt == 0);
                // XXX -> page_drop(page)
                pagemap_del(&fileh->pagemap, page->f_pgoffset);
                page_drop_memory(page);
                page_del(page);
            }
        }
    }


    /* if we successfully finished with markstored flag set - all dirty pages
     * should become non-dirty */
    if (flags & WRITEOUT_MARKSTORED)
        BUG_ON(!list_empty(&fileh->dirty_pages));

out:
    fileh->writeout_inprogress = 0;

    virt_unlock();
    sigsegv_restore(&save_sigset);
    return err;
}
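
/* Illustrative calls of fileh_dirty_writeout (error handling omitted):
 *
 *      fileh_dirty_writeout(fileh, WRITEOUT_STORE);                        // store dirty pages; they stay dirty
 *      fileh_dirty_writeout(fileh, WRITEOUT_STORE | WRITEOUT_MARKSTORED);  // store and mark pages as non-dirty
 */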


void fileh_dirty_discard(BigFileH *fileh)
{
    Page *page;
    struct list_head *hpage, *hpage_next;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* discard is not allowed to run in parallel to writeout */
    BUG_ON(fileh->writeout_inprogress);

    list_for_each_safe(hpage, hpage_next, &fileh->dirty_pages) {
        page = list_entry(hpage, typeof(*page), in_dirty);
        BUG_ON(page->state != PAGE_DIRTY);

        // FIXME do pagemap_del + drop_ram + page_del unconditionally
        //       (just need think again and to update !wcfs discard test)
        if (fileh->mmap_overlay) {
            pagemap_del(&fileh->pagemap, page->f_pgoffset);
            page_drop_memory(page);
            page_del(page);
        } else {
            page_drop_memory(page);
        }
    }

    BUG_ON(!list_empty(&fileh->dirty_pages));

    virt_unlock();
    sigsegv_restore(&save_sigset);
}


/****************
 * INVALIDATION *
 ****************/

void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset)
{
    Page *page;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* it's an error to invalidate fileh while writeout is in progress */
    BUG_ON(fileh->writeout_inprogress);
    // XXX wcfs: invalidate_page must not be called (wcfs handles invalidations itself)
    // XXX       or allow invalidate anyway (e.g. DIRTY -> base) ?
    // XXX yes -> allow invalidate for wcfs too - means forget in-ram page and mmap to base memory

    page = pagemap_get(&fileh->pagemap, pgoffset);
    if (page) {
        /* for a page whose loading is in progress, we just remove it from the
         * pagemap and mark it to be dropped by its loader after loading is done.
         * In the meantime, as the pagemap entry is now empty, on the next access
         * to that memory the page will be created/loaded anew */
        if (page->state == PAGE_LOADING) {
            pagemap_del(&fileh->pagemap, pgoffset);
            page->state = PAGE_LOADING_INVALIDATED;
        }
        /* else we just make sure to drop page memory */
        else {
            page_drop_memory(page);
            // XXX + page_del       ?
            // XXX + pagemap_del    ?
        }
    }

    virt_unlock();
    sigsegv_restore(&save_sigset);
}


/************************
 *  Lookup VMA by addr  *
 ************************/

/* list of all registered VMA(s); kept sorted by address (see virt_register_vma) */
static LIST_HEAD(vma_list);

/* protects ^^^  XXX */
//spinlock_t vma_list_lock;



/* lookup VMA covering `addr`. NULL if not found
 * (should be called with virtmem lock held) */
VMA *virt_lookup_vma(void *addr)
{
    uintptr_t uaddr = (uintptr_t)addr;
    struct list_head *h;
    VMA *vma;

    list_for_each(h, &vma_list) {
        // XXX -> list_for_each_entry
        vma = list_entry(h, typeof(*vma), virt_list);
        if (uaddr < vma->addr_stop)
            /*
             * here:  uaddr < vma->addr_stop, and this vma->addr_stop is the
             * first such addr_stop; it remains to check vma->addr_start <= uaddr
             */
            return (vma->addr_start <= uaddr) ? vma : NULL;
    }

    return NULL;    /* not found at all or no overlap */
}
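
/* Example (illustrative addresses): with registered VMAs covering
 * [0x1000,0x3000) and [0x5000,0x7000), virt_lookup_vma((void *)0x2500)
 * returns the first VMA, while virt_lookup_vma((void *)0x4000) stops at the
 * second VMA (the first one with addr_stop > 0x4000) and returns NULL, since
 * that VMA's addr_start is above the looked-up address. */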


/* register VMA `vma` as covering some file view
 * (should be called with virtmem lock held) */
void virt_register_vma(VMA *vma)
{
    uintptr_t uaddr = vma->addr_start;
    struct list_head *h;
    struct VMA *a;

    list_for_each(h, &vma_list) {
        a = list_entry(h, typeof(*a), virt_list);
        if (uaddr < a->addr_stop)
            break;
    }

    /* either before found vma or, if not found, at the end of the list */
    list_add_tail(&vma->virt_list, h);
}


/* remove `vma` from VMA registry. `vma` must have been registered before
 * (should be called with virtmem lock held) */
void virt_unregister_vma(VMA *vma)
{
    /* _init - to clear links, just in case */
    list_del_init(&vma->virt_list);
}


/*****************************************/

/*
 * allocate virtual memory address space
 * the pages are initially protected to prevent any access
 *
 * @addr    NULL - anywhere,        !NULL - exactly there
 * @return  !NULL - mapped there    NULL - error
 */
void *mem_valloc(void *addr, size_t len)
{
    void *a;
    a = mmap(addr, len, PROT_NONE,
            MAP_PRIVATE | MAP_ANONYMOUS
            /* don't try to (pre-)allocate memory - just virtual address space */
            | MAP_NORESERVE
            | (addr ? MAP_FIXED : 0),
            -1, 0);

    if (a == MAP_FAILED)
        a = NULL;

    if (a && addr)
        /* verify OS respected our MAP_FIXED request */
        BUG_ON(a != addr);

    return a;
}
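
/* In the !wcfs case this is how a vma gets its address range: fileh_mmap
 * reserves the whole range PROT_NONE via mem_valloc, and individual pages are
 * later mmapped over parts of that reservation by vma_mmap_page (and put back
 * to PROT_NONE by vma_page_ensure_unmapped). */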


/* like mem_valloc() but allocation must not fail */
void *mem_xvalloc(void *addr, size_t len)
{
    void *a;
    a = mem_valloc(addr, len);
    BUG_ON(!a);
    return a;
}


/*********************
 * PAGEFAULT HANDLER *
 *********************/

/* pagefault entry when we know request came to our memory area
 *
 * (virtmem_lock already taken by caller)   */
VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
{
    pgoff_t pagen;
    Page *page;
    BigFileH *fileh;
    struct list_head *hmmap;

    /* continuing on_pagefault() - see (1) there ... */

    /* (2) vma, addr -> fileh, pagen    ;idx of fileh page covering addr */
    fileh = vma->fileh;
    pagen = vma_addr_fpgoffset(vma, addr);

    /* wcfs: we should get into SIGSEGV handler only on write access */
    if (fileh->mmap_overlay)
        BUG_ON(!write);

    /* (3) fileh, pagen -> page  (via pagemap) */
    page = pagemap_get(&fileh->pagemap, pagen);

    /* wcfs: all dirty pages are mmapped when vma is created.
     *       thus here, if page is present in pagemap, it can be only either
     *       - a page we just loaded for dirtying, or
     *       - a page that is in progress of being loaded.
     *
     * ( PAGE_LOADED_FOR_WRITE is used only to verify that in wcfs mode we
     *   always keep all dirty pages mmapped on fileh_open and so pagefault
     *   handler must not see a PAGE_LOADED page. )
     */
    if (fileh->mmap_overlay && page)
        ASSERT(page->state == PAGE_LOADED_FOR_WRITE || page->state == PAGE_LOADING);

    /* (4) no page found - allocate new from ram */
    while (!page) {
        page = ramh_alloc_page(fileh->ramh, pagen);
        if (!page) {
            /* try to release some memory back to OS */
            // XXX do we need and how to distinguish "no ram page" vs "no memory for `struct page`"?
            //     -> no we don't -- better allocate memory for struct pages for whole RAM at ram setup
            if (!__ram_reclaim(fileh->ramh->ram))
                OOM();
            continue;
        }

        /* ramh_alloc_page set up .ramh, .ramh_pgoffset (and .state?);
         * now set up the rest (link to fileh)  */
        page->fileh      = fileh;
        page->f_pgoffset = pagen;

        /* remember page in fileh->pagemap[pagen] */
        pagemap_set(&fileh->pagemap, pagen, page);
    }

    /* (5a) if page was not yet loaded - start loading it */
    if (page->state == PAGE_EMPTY) {
        /* NOTE if we loaded data in-place, there would be a race with concurrent
         * access to the page here - after memory access to the page is first
         * enabled, other threads could end up reading corrupt data while the
         * load has not yet finished.
         *
         * so to avoid this we first load data into a separate memory area and
         * then mmap-duplicate that page into place; this is more work compared
         * to what the kernel does internally.
         *
         * TODO try to use remap_anon_pages() when it is ready
         *      (but unfortunately it is only for anonymous memory)
         * NOTE remap_file_pages() is going away...
         */
        blk_t blk;
        void *pageram;
        int err;
        BigFile *file;

        /*
         * if pagesize < blksize - need to prepare several adjacent pages for blk;
         * if pagesize > blksize - will need to either 1) rescan which blk got
         *    dirty, or 2) store not-even-touched blocks adjacent to modified one.
         */
        file = fileh->file;
        TODO (file->blksize != page_size(page));

        // FIXME doing this mmap-to-temp/unmap is somewhat costly. Better
        // constantly have whole RAM mapping somewhere R/W and load there.
        // (XXX but then we'll either have
        //    - VMA fragmented  (if we manage whole RAM as 1 file of physram size),
        //    - or need to waste a lot of address space (size of each ramh can be very large)
        //
        //    generally this way it also has major problems)
        //
        // Also this way, we btw don't need to require python code to drop all
        // references to loading buf.

        /* mmap page memory temporarily somewhere
         * XXX better pre-map all ram pages r/w in another area to not need to mmap/unmap it here
         *     -> will run slightly faster (but major slowdown is in clear_page in kernel)
         */
        // TODO MAP_UNINITIALIZED somehow? (we'll overwrite that memory)
        pageram = page_mmap(page, NULL, PROT_READ | PROT_WRITE);
        TODO(!pageram); // XXX err

        /* load block -> pageram memory */
        blk = page->f_pgoffset;     // NOTE because blksize = pagesize

        /* mark page as loading and unlock virtmem before calling loadblk()
         *
         * that call is potentially slow and external code can take other
         * locks. If that "other locks" are also taken before external code
         * calls e.g. fileh_invalidate_page() in different codepath a deadlock
         * can happen. (similar to storeblk case) */
        page->state = PAGE_LOADING;
        virt_unlock();

        if (fileh->mmap_overlay) {
            /* wcfs: copy block data from the read-only base mmap.
             * NOTE we'll get SIGBUS here if wcfs returns EIO when loading block data */
            memcpy(pageram, vma_page_addr(vma, page), page_size(page));
        }
        else {
            /* !wcfs: call loadblk */
            err = file->file_ops->loadblk(file, blk, pageram);

            /* TODO on error -> try to throw exception somehow to the caller, so
             *      that it can abort current transaction, but not die.
             *
             * NOTE for analogue situation when read for mmaped file fails, the
             *      kernel sends SIGBUS
             */
            TODO (err);
        }

        /* relock virtmem */
        virt_lock();

        xmunmap(pageram, page_size(page));

        /* if the page was invalidated while we were loading it, we have to drop
         * its memory and the Page structure completely - the invalidator already
         * removed it from the pagemap */
        if (page->state == PAGE_LOADING_INVALIDATED) {
            page_drop_memory(page);
            page_del(page);
            // XXX + pagemap_del    ?
//            list_del(&page->lru);
//            bzero(page, sizeof(*page)); /* just in case */
//            free(page);
        }

        /* else just mark the page as loaded ok */
        else
            page->state = (write ? PAGE_LOADED_FOR_WRITE : PAGE_LOADED);

        /* we have to retry the whole fault, because the vma could have been
         * changed while we were loading page with virtmem lock released */
        return VM_RETRY;
    }

    /* (5b) page is currently being loaded by another thread - wait for load to complete
     *
     * NOTE a page is protected from being loaded by two threads at the same
     * time via:
     *
     *   - the virtmem lock - we get/put pages from fileh->pagemap only under it,
     *   - page->state is set to PAGE_LOADING while loading is in progress,
     *   - such a page is inserted into fileh->pagemap;
     *
     * so if a second thread faults at the same memory page while the page is
     * still loading, it will find the page in PAGE_LOADING state and will just
     * wait for the load to complete. */
    if (page->state == PAGE_LOADING) {
        /* XXX polling instead of proper completion */
        void *gilstate;
        virt_unlock();
        gilstate = virt_gil_ensure_unlocked();
        usleep(10000);  // XXX with 1000 usleep still busywaits
        virt_gil_retake_if_waslocked(gilstate);
        virt_lock();
        return VM_RETRY;
    }

    /* (6) page data ready. Mmap it atomically into vma address space, or mprotect
     * appropriately if it was already mmaped. */
    PageState newstate = PAGE_LOADED;   // XXX vvv PAGE_LOADED_FOR_WRITE ok?
    if (write || page->state == PAGE_DIRTY || page->state == PAGE_LOADED_FOR_WRITE) {
        newstate = PAGE_DIRTY;
    }

    // XXX also call page->markdirty() ?
    if (newstate == PAGE_DIRTY  &&  newstate != page->state) {
        /* it is not allowed to modify pages while writeout is in progress */
        BUG_ON(fileh->writeout_inprogress);

        list_add_tail(&page->in_dirty, &fileh->dirty_pages);
    }
    page->state = max(page->state, newstate);

    // XXX overlay: assert !vma->page_ismappedv[blk]    XXX not ok? (retrying after virt unlock/lock)
    // XXX mmap page to all vma with .mmap_overlay=1 of this fileh.

    vma_mmap_page(vma, page);
    /* wcfs: mmap the page into all wcfs-backed vmas of this fileh. If we don't,
     * reads through those vmas would keep returning stale base-layer data */
    if (fileh->mmap_overlay) {
        list_for_each(hmmap, &fileh->mmaps) {
            VMA *vma2 = list_entry(hmmap, typeof(*vma2), same_fileh);
            if (vma2 != vma)
                vma_mmap_page(vma2, page);
        }
    }

    /* mark page as used recently */
    // XXX = list_move_tail()
    list_del(&page->lru);
    list_add_tail(&page->lru, &page->ramh->ram->lru_list);

    /*
     * (7) access to page prepared - now it is ok to return from signal handler
     *     - the caller will re-try executing faulting instruction.
     */
    return VM_HANDLED;
}


/***********
 * RECLAIM *
 ***********/

#define RECLAIM_BATCH   64      /* how many pages to reclaim at once */
static int __ram_reclaim(RAM *ram)
{
    struct list_head *lru_list = &ram->lru_list;
    struct list_head *hlru;
    Page *page;
    int batch = RECLAIM_BATCH, scanned = 0;

    TRACE("RAM_RECLAIM\n");
    hlru = lru_list->next;

    while (batch && hlru != lru_list) {
        page = list_entry(hlru, typeof(*page), lru);
        hlru = hlru->next;
        scanned++;

        /* can release ram only from loaded non-dirty pages
         * NOTE PAGE_LOADING pages are not dropped - they just continue to load */
        if (page->state == PAGE_LOADED || page->state == PAGE_LOADED_FOR_WRITE) {
            page_drop_memory(page);
            batch--;
        }

        /* PAGE_EMPTY pages without mappers go away */
        // XXX merge vvv with ^^^ : page_drop_memory + pagemap_del + page_del
        if (page->state == PAGE_EMPTY) {
            BUG_ON(page->refcnt != 0);  // XXX what for then we have refcnt? -> vs discard

            /* delete page & its entry in fileh->pagemap */
            pagemap_del(&page->fileh->pagemap, page->f_pgoffset);
            page_del(page);
//            list_del(&page->lru);
//            bzero(page, sizeof(*page)); /* just in case */
//            free(page);
        }

    }

    TRACE("\t-> reclaimed %i  scanned %i\n", RECLAIM_BATCH - batch, scanned);
    return RECLAIM_BATCH - batch;
}


int ram_reclaim(RAM *ram)
{
    int ret;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    ret = __ram_reclaim(ram);

    virt_unlock();
    sigsegv_restore(&save_sigset);
    return ret;
}


/********************
 * Internal helpers *
 ********************/

static size_t page_size(const Page *page)
{
    return page->ramh->ram->pagesize;
}

void page_incref(Page *page)
{
    page->refcnt++;     // XXX atomically ?
}

void page_decref(Page *page)
{
    page->refcnt--;     // XXX atomically ?
    BUG_ON(page->refcnt < 0);

    // TODO if unused delete self && clear pagemap ?
    // XXX  if dirty -> delete = not ok
}

void *page_mmap(Page *page, void *addr, int prot)
{
    RAMH *ramh = page->ramh;
    // XXX better call ramh_mmap_page() without tinkering with ramh_ops?
    return ramh->ramh_ops->mmap_page(ramh, page->ramh_pgoffset, addr, prot);
}


// XXX -> page_drop = drop memory, delete page from pagemap, delete page
static void page_drop_memory(Page *page)
{
    /* Memory for this page goes out. 1) unmap it from all mmaps */
    struct list_head *hmmap;

    /* NOTE we try not to drop memory for loading-in-progress pages.
     *      so if this is called for such a page - it is a bug. */
    BUG_ON(page->state == PAGE_LOADING);
    /* same for storing-in-progress */
    BUG_ON(page->fileh->writeout_inprogress && page->state == PAGE_DIRTY);

    if (page->state == PAGE_EMPTY)
        return;

    list_for_each(hmmap, &page->fileh->mmaps) {
        VMA *vma = list_entry(hmmap, typeof(*vma), same_fileh);
        vma_page_ensure_unmapped(vma, page);
    }

    /* 2) release memory to ram */
    ramh_drop_memory(page->ramh, page->ramh_pgoffset);
    if (page->state == PAGE_DIRTY)
        list_del_init(&page->in_dirty);
    page->state = PAGE_EMPTY;

    // XXX touch lru?
}

/* page_del deletes Page struct (but not page memory - see page_drop_memory).
 *
 * The page is removed from ram->lru.
 */
static void page_del(Page *page) {
    BUG_ON(page->refcnt != 0);
    BUG_ON(page->state == PAGE_DIRTY);  // XXX + PAGE_LOADING ?

    list_del(&page->lru);
    bzero(page, sizeof(*page)); /* just in case */
    free(page);
}



/* vma: page -> addr  where it should be mmapped in vma */
static void *vma_page_addr(VMA *vma, Page *page)
{
    uintptr_t addr;
    ASSERT(vma->fileh == page->fileh);      // XXX needed here?

    addr = vma->addr_start + (page->f_pgoffset - vma->f_pgoffset) * page_size(page);
    ASSERT(vma->addr_start <= addr  &&
                              addr < vma->addr_stop);
    return (void *)addr;
}


/* vma: addr -> fileh pgoffset  of the page containing addr */
static pgoff_t vma_addr_fpgoffset(VMA *vma, uintptr_t addr)
{
    return vma->f_pgoffset + (addr - vma->addr_start) / vma->fileh->ramh->ram->pagesize;
}
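
/* Example (illustrative numbers): with pagesize = 4096, vma->f_pgoffset = 10
 * and addr = vma->addr_start + 3*4096 + 13, vma_addr_fpgoffset returns
 * 10 + 3 = 13, and vma_page_addr maps that page back to
 * vma->addr_start + 3*4096. */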


/* vma_mmap_page mmaps page into vma.
 *
 * the page must belong to covered file.
 * mmap protection is PROT_READ if page is PAGE_LOADED or PROT_READ|PROT_WRITE
 * if page is PAGE_DIRTY.
 *
 * must be called under virtmem lock.
 */
static void vma_mmap_page(VMA *vma, Page *page) {
    pgoff_t pgoff_invma;
    int prot = (page->state == PAGE_DIRTY ? PROT_READ|PROT_WRITE : PROT_READ);

    ASSERT(page->state == PAGE_LOADED || page->state == PAGE_DIRTY);
    ASSERT(vma->f_pgoffset <= page->f_pgoffset &&
                              page->f_pgoffset < vma_addr_fpgoffset(vma, vma->addr_stop));

    pgoff_invma = page->f_pgoffset - vma->f_pgoffset;
    if (!bitmap_test_bit(vma->page_ismappedv, pgoff_invma)) {
        // XXX err
        page_mmap(page, vma_page_addr(vma, page), prot);
        bitmap_set_bit(vma->page_ismappedv, pgoff_invma);
        page_incref(page);
    }
    else {
        /* just changing protection bits should not fail, if parameters ok */
        xmprotect(vma_page_addr(vma, page), page_size(page), prot);
    }
}

/* is `page` mapped to `vma` */
static int vma_page_ismapped(VMA *vma, Page *page)
{
    pgoff_t vma_fpgstop;
    ASSERT(vma->fileh == page->fileh);

    vma_fpgstop = vma_addr_fpgoffset(vma, vma->addr_stop);
    if (!(vma->f_pgoffset <= page->f_pgoffset  &&
                             page->f_pgoffset < vma_fpgstop))
        return 0;

    return bitmap_test_bit(vma->page_ismappedv, page->f_pgoffset - vma->f_pgoffset);
}


/* ensure `page` is not mapped to `vma` */
static void vma_page_ensure_unmapped(VMA *vma, Page *page)
{
    if (!vma_page_ismapped(vma, page))
        return;

    if (vma->fileh->mmap_overlay) {
        /* wcfs: remmap readonly to base image */
        BigFile *file = vma->fileh->file;
        int err;

        TODO (file->blksize != page_size(page));
        err = file->file_ops->remmap_blk_read(file, /* blk = */page->f_pgoffset, vma);
        BUG_ON(err); /* must not fail */
    }
    else {
        /* !wcfs: mmap empty PROT_NONE address space instead of page memory */
        mem_xvalloc(vma_page_addr(vma, page), page_size(page));
    }

    bitmap_clear_bit(vma->page_ismappedv, page->f_pgoffset - vma->f_pgoffset);
    page_decref(page);
}


/* ensure `page` is not mapped RW to `vma`
 *
 * if mapped -> should be mapped as R
 * if not mapped - leave as is
 */
static void vma_page_ensure_notmappedrw(VMA *vma, Page *page)
{
    if (!vma_page_ismapped(vma, page))
        return;

    /* just changing protection - should not fail */
    // XXX PROT_READ always? (it could be mmaped with PROT_NONE before without
    // first access) - then it should not be mapped in page_ismappedv -> ok.
    xmprotect(vma_page_addr(vma, page), page_size(page), PROT_READ);
}

/* return whether fileh page is dirty or not.
 *
 * must be called under virtmem lock.
 */
bool __fileh_page_isdirty(BigFileH *fileh, pgoff_t pgoffset)
{
    Page *page;

    // XXX sigsegv_block ?

    page = pagemap_get(&fileh->pagemap, pgoffset);
    if (!page)
        return false;

    return (page->state == PAGE_DIRTY);
}


// XXX stub
void OOM(void)
{
    BUG();
}