/* Wendelin.bigfile | Virtual memory
 * Copyright (C) 2014-2019  Nexedi SA and Contributors.
 *                          Kirill Smelkov <kirr@nexedi.com>
 *
 * This program is free software: you can Use, Study, Modify and Redistribute
 * it under the terms of the GNU General Public License version 3, or (at your
 * option) any later version, as published by the Free Software Foundation.
 *
 * You can also Link and Combine this program with other software covered by
 * the terms of any of the Free Software licenses or any of the Open Source
 * Initiative approved licenses and Convey the resulting work. Corresponding
 * source of such a combination shall include the source code for all other
 * software used.
 *
 * This program is distributed WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See COPYING file for full licensing terms.
 * See https://www.nexedi.com/licensing for rationale and options.
 *
 *
 * TODO description
 */

#include <wendelin/bigfile/virtmem.h>
#include <wendelin/bigfile/file.h>
#include <wendelin/bigfile/pagemap.h>
#include <wendelin/bigfile/ram.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>

#include <ccan/minmax/minmax.h>

#include <sys/mman.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static size_t   page_size(const Page *page);
static void     page_drop_memory(Page *page);
static void     page_del(Page *page);
static void    *vma_page_addr(VMA *vma, Page *page);
static pgoff_t  vma_addr_fpgoffset(VMA *vma, uintptr_t addr);
static void     vma_mmap_page(VMA *vma, Page *page);
static int      vma_page_ismapped(VMA *vma, Page *page);
static void     vma_page_ensure_unmapped(VMA *vma, Page *page);
static void     vma_page_ensure_notmappedrw(VMA *vma, Page *page);
static int      __ram_reclaim(RAM *ram);

#define VIRT_DEBUG   0
#if VIRT_DEBUG
# define TRACE(msg, ...) do { fprintf(stderr, msg, ## __VA_ARGS__); } while (0)
#else
# define TRACE(msg, ...) do {} while(0)
#endif


/* global lock which protects manipulating virtmem data structures
 *
 * NOTE not scalable, but this is a temporary solution - as we are going to
 * move memory management back into the kernel, where it is done properly. */
static pthread_mutex_t virtmem_lock = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP;
static const VirtGilHooks *virtmem_gilhooks;

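/* virt_gil_ensure_unlocked makes sure the current thread does not hold the
 * interpreter GIL (via hooks registered with virt_lock_hookgil, if any) and
 * returns an opaque state to pass to virt_gil_retake_if_waslocked later. */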
void *virt_gil_ensure_unlocked(void)
{
    void *gilstate = NULL;

    if (virtmem_gilhooks)
        gilstate = virtmem_gilhooks->gil_ensure_unlocked();

    return gilstate;
}


void virt_gil_retake_if_waslocked(void *gilstate)
{
    if (gilstate)
        virtmem_gilhooks->gil_retake_if_waslocked(gilstate);
}

void virt_lock()
{
    void *gilstate = NULL;

    /* make sure we don't hold e.g. the Python GIL (so as not to deadlock, as the GIL oscillates) */
    gilstate = virt_gil_ensure_unlocked();

    /* acquire virtmem lock */
    xpthread_mutex_lock(&virtmem_lock);

    /* retake GIL if we were holding it originally */
    virt_gil_retake_if_waslocked(gilstate);
}

void virt_unlock()
{
    xpthread_mutex_unlock(&virtmem_lock);
}


void virt_lock_hookgil(const VirtGilHooks *gilhooks)
{
    // FIXME we hit vvv assert for real because `import bigfile,
    // wendelin.bigfile` imports bigfile/__init__.py twice, and that in turn
    // imports bigfile/_bigfile.so twice. However Python loads the _bigfile.so
    // DSO only once - oops. The bug happens in practice when running tests via
    // pytest under python3.
    //
    // XXX temp hack to workaround that bug for now.
    if (virtmem_gilhooks == gilhooks)
	    virtmem_gilhooks = NULL;

    BUG_ON(virtmem_gilhooks);       /* prevent registering multiple times */
    virtmem_gilhooks = gilhooks;
}

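/* Illustrative sketch of registering GIL hooks (the real registration is done
 * by the Python binding; the my_* names below are hypothetical, only the
 * VirtGilHooks field names come from this file):
 *
 *      static void *my_gil_ensure_unlocked(void)
 *      {
 *          // release the interpreter lock if this thread holds it and
 *          // return a non-NULL cookie in that case; return NULL otherwise
 *          return ...;
 *      }
 *
 *      static void my_gil_retake_if_waslocked(void *cookie)
 *      {
 *          // re-acquire the interpreter lock iff cookie != NULL
 *      }
 *
 *      static const VirtGilHooks my_gilhooks = {
 *          .gil_ensure_unlocked     = my_gil_ensure_unlocked,
 *          .gil_retake_if_waslocked = my_gil_retake_if_waslocked,
 *      };
 *
 *      virt_lock_hookgil(&my_gilhooks);
 */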

/* block/restore SIGSEGV for the current thread - code outside of the pagefault
 * handler should not access any not-mmapped memory -> so on any such pagefault
 * we should just die with a coredump, not try to incorrectly handle it.
 *
 * NOTE sigmask is per-thread. When blocking there is no race wrt other threads
 * correctly accessing data via pagefaulting.   */
static void sigsegv_block(sigset_t *save_sigset)
{
    sigset_t mask_segv;
    xsigemptyset(&mask_segv);
    xsigaddset(&mask_segv, SIGSEGV);
    xpthread_sigmask(SIG_BLOCK, &mask_segv, save_sigset);
}

static void sigsegv_restore(const sigset_t *save_sigset)
{
    int how = xsigismember(save_sigset, SIGSEGV) ? SIG_BLOCK : SIG_UNBLOCK;
    sigset_t mask_segv;

    xsigemptyset(&mask_segv);
    xsigaddset(&mask_segv, SIGSEGV);
    xpthread_sigmask(how, &mask_segv, NULL);
}


/****************
 * OPEN / CLOSE *
 ****************/

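/* Typical client usage of the API in this file, as a rough sketch (error
 * handling is omitted and the flag/offset/length values are illustrative):
 *
 *      BigFileH fileh;
 *      VMA      vma;
 *
 *      fileh_open(&fileh, file, ram, DONT_MMAP_OVERLAY);
 *      fileh_mmap(&vma, &fileh, 0, 4);          // view file pages [0, 4)
 *      ((char *)vma.addr_start)[0] = 1;         // pagefault loads & dirties page 0
 *      fileh_dirty_writeout(&fileh, WRITEOUT_STORE | WRITEOUT_MARKSTORED);
 *      vma_unmap(&vma);
 *      fileh_close(&fileh);
 */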
int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram, FileHOpenFlags flags)
{
    int err = 0;
    sigset_t save_sigset;
    const bigfile_ops *fops = file->file_ops;

    if (!(flags == 0 || flags == MMAP_OVERLAY || flags == DONT_MMAP_OVERLAY))
        return -EINVAL;
    if (flags == 0) {
        flags = fops->mmap_setup_read ? MMAP_OVERLAY : DONT_MMAP_OVERLAY;
    }
    if (flags & MMAP_OVERLAY && flags & DONT_MMAP_OVERLAY)
        return -EINVAL;
    if (flags == MMAP_OVERLAY) {
        ASSERT(fops->mmap_setup_read);
        ASSERT(fops->remmap_blk_read);
    }
    if (flags == DONT_MMAP_OVERLAY) {
        ASSERT(fops->loadblk);
    }

    sigsegv_block(&save_sigset);
    virt_lock();

    bzero(fileh, sizeof(*fileh));
    fileh->ramh = ramh_open(ram);
    if (!fileh->ramh) {
        err = -1;
        goto out;
    }

    fileh->file = file;
    INIT_LIST_HEAD(&fileh->mmaps);
    INIT_LIST_HEAD(&fileh->dirty_pages);
    fileh->writeout_inprogress = 0;
    pagemap_init(&fileh->pagemap, ilog2_exact(ram->pagesize));

    fileh->mmap_overlay = (flags == MMAP_OVERLAY);

out:
    virt_unlock();
    sigsegv_restore(&save_sigset);
    return err;
}


void fileh_close(BigFileH *fileh)
{
    Page *page;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* it's an error to close fileh with existing mappings */
    // XXX implement the same semantics usual files have wrt mmaps - if we release
    // fileh, but mapping exists - real fileh release is delayed to last unmap ?
    BUG_ON(!list_empty(&fileh->mmaps));

    /* it's an error to close fileh while writeout is in progress */
    BUG_ON(fileh->writeout_inprogress);

    /* drop all pages (dirty or not) associated with this fileh */
    pagemap_for_each(page, &fileh->pagemap) {
        /* it's an error to close fileh while access to one of its mappings is
         * currently being done in another thread */
        BUG_ON(page->state == PAGE_LOADING);
        page_drop_memory(page);
        page_del(page);
//        list_del(&page->lru);
//        bzero(page, sizeof(*page)); /* just in case */
//        free(page);
    }

    BUG_ON(!list_empty(&fileh->dirty_pages));

    /* and clear pagemap */
    pagemap_clear(&fileh->pagemap);

    if (fileh->ramh)
        ramh_close(fileh->ramh);

    bzero(fileh, sizeof(*fileh));
    virt_unlock();
    sigsegv_restore(&save_sigset);
}



/****************
 * MMAP / UNMAP *
 ****************/

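/* fileh_mmap maps fileh file pages [pgoffset, pgoffset+pglen) into a new
 * memory area described by vma. */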
int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen)
{
    void *addr;
    size_t len = pglen * fileh->ramh->ram->pagesize;
    BigFile *file = fileh->file;
    const bigfile_ops *fops = file->file_ops;
    int err = 0;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* start preparing vma */
    bzero(vma, sizeof(*vma));
    vma->fileh       = fileh;
    vma->f_pgoffset  = pgoffset;

    /* alloc vma->page_ismappedv[] */
    vma->page_ismappedv = bitmap_alloc0(pglen);
    if (!vma->page_ismappedv)
        goto fail;

    if (fileh->mmap_overlay) {
        /* wcfs: mmap(base, READ) */
        TODO (file->blksize != fileh->ramh->ram->pagesize);
        addr = fops->mmap_setup_read(file, pgoffset, pglen, vma);
    } else {
        /* !wcfs: allocate address space somewhere */
        addr = mem_valloc(NULL, len);
    }
    if (!addr)
        goto fail;

    /* vma address range known */
    vma->addr_start  = (uintptr_t)addr;
    vma->addr_stop   = vma->addr_start + len;

    /* wcfs: mmap(fileh->dirty_pages) over base */
    if (fileh->mmap_overlay) {
        Page* page;
        struct list_head *hpage;

        list_for_each(hpage, &fileh->dirty_pages) {
            page = list_entry(hpage, typeof(*page), in_dirty);
            BUG_ON(page->state != PAGE_DIRTY);

            if (!(pgoffset <= page->f_pgoffset && page->f_pgoffset < pgoffset + pglen))
                continue; /* page is out of requested mmap coverage */

            // XXX notify watcher that we mmap RAM page in its range? -> no need
            vma_mmap_page(vma, page);
        }
    }

    /* everything allocated - link it up */

    // XXX need to init vma->virt_list first?
    /* hook vma to fileh->mmaps */
    list_add_tail(&vma->same_fileh, &fileh->mmaps);

    /* register vma for pagefault handling */
    virt_register_vma(vma);

out:
    virt_unlock();
    sigsegv_restore(&save_sigset);
    return err;

fail:
    free(vma->page_ismappedv);
    bzero(vma, sizeof(*vma));
    err = -1;
    goto out;
}


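/* vma_unmap unmaps vma from the address space and unlinks it from its fileh;
 * references to pages that were mapped through this vma are dropped. */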
void vma_unmap(VMA *vma)
{
    BigFileH *fileh = vma->fileh;
    size_t   len    = vma->addr_stop - vma->addr_start;
    size_t   pglen  = len / fileh->ramh->ram->pagesize;
    int i;
    pgoff_t  pgoffset;
    Page *page;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* unregister from vmamap - so that pagefault handler does not recognize
     * this area as valid */
    virt_unregister_vma(vma);

    /* unlink from fileh.mmaps   XXX place ok ? */
    list_del_init(&vma->same_fileh);

    /* unmap whole vma at once - the kernel unmaps each mapping in turn.
     * NOTE error here would mean something is broken */
    if (fileh->mmap_overlay) {
        fileh->file->file_ops->munmap(fileh->file, vma);
    } else {
        xmunmap((void *)vma->addr_start, len);
    }

    /* scan through mapped-to-this-vma pages and release them */
    for (i=0; i < pglen; ++i) {
        if (!bitmap_test_bit(vma->page_ismappedv, i))
            continue;

        pgoffset = vma->f_pgoffset + i;
        page = pagemap_get(&fileh->pagemap, pgoffset);
        BUG_ON(!page);
        page_decref(page);
    }

    /* free memory and be done */
    free(vma->page_ismappedv);

    bzero(vma, sizeof(*vma));
    virt_unlock();
    sigsegv_restore(&save_sigset);
}


/**********************
 * WRITEOUT / DISCARD *
 **********************/

/* helper for sorting dirty pages by ->f_pgoffset */
static int hpage_indirty_cmp_bypgoffset(struct list_head *hpage1, struct list_head *hpage2, void *_)
{
    Page *page1 = list_entry(hpage1, typeof(*page1), in_dirty);
    Page *page2 = list_entry(hpage2, typeof(*page2), in_dirty);

    if (page1->f_pgoffset < page2->f_pgoffset)
        return -1;
    if (page1->f_pgoffset > page2->f_pgoffset)
        return +1;
    return 0;
}

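/* fileh_dirty_writeout processes dirty pages of fileh: with WRITEOUT_STORE
 * each dirty page is stored to the file via ->storeblk(); with
 * WRITEOUT_MARKSTORED the pages are then marked as no longer dirty. */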
int fileh_dirty_writeout(BigFileH *fileh, enum WriteoutFlags flags)
{
    Page *page;
    BigFile *file = fileh->file;
    struct list_head *hpage, *hpage_next, *hmmap;
    sigset_t save_sigset;
    int err = 0;

    /* check flags */
    if (!(flags &  (WRITEOUT_STORE | WRITEOUT_MARKSTORED))   ||
          flags & ~(WRITEOUT_STORE | WRITEOUT_MARKSTORED))
        return -EINVAL;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* concurrent writeouts are not allowed */
    BUG_ON(fileh->writeout_inprogress);
    fileh->writeout_inprogress = 1;

    /* pages are stored (if stored) in sorted order */
    if (flags & WRITEOUT_STORE)
        list_sort(&fileh->dirty_pages, hpage_indirty_cmp_bypgoffset, NULL);

    /* write out dirty pages */
    list_for_each_safe(hpage, hpage_next, &fileh->dirty_pages) {
        page = list_entry(hpage, typeof(*page), in_dirty);
        BUG_ON(page->state != PAGE_DIRTY);

        /* ->storeblk() */
        if (flags & WRITEOUT_STORE) {
            TODO (file->blksize != page_size(page));
            blk_t blk = page->f_pgoffset;   // NOTE assumes blksize = pagesize

            void *pagebuf;

            /* mmap page temporarily somewhere
             *
             * ( we cannot use the page's present mapping in some vma directly,
             *   because while storeblk is called with the virtmem lock released
             *   that mapping can go away ) */
            pagebuf = page_mmap(page, NULL, PROT_READ);
            TODO(!pagebuf); // XXX err

            /* unlock virtmem before calling storeblk()
             *
             * that call is potentially slow and external code can take other
             * locks. If that "other locks" are also taken before external code
             * calls e.g. fileh_invalidate_page() in different codepath a deadlock
             * can happen. (similar to loadblk case) */
            virt_unlock();

            err = file->file_ops->storeblk(file, blk, pagebuf);

            /* relock virtmem */
            virt_lock();

            xmunmap(pagebuf, page_size(page));

            if (err)
                goto out;
        }

        /* wcfs:  remmap RW pages to base layer
         * !wcfs: page.state -> PAGE_LOADED and correct mappings RW -> R
         *
         * NOTE for transactional storage (ZODB and ZBigFile) storeblk creates
         * a new transaction on the database side, but does not update the
         * current DB connection to view that transaction. Thus if loadblk is
         * called with a not-yet-resynced DB connection, it will return old -
         * not yet stored - data. For the !wcfs case this is partly mitigated
         * by the fact that stored pages are kept as PAGE_LOADED in ram, but it
         * cannot be relied upon, as ram_reclaim can drop those pages and read
         * access to them will trigger loadblk from the database, which will
         * return old data. For the wcfs case remapping to the base layer will
         * always return old data until the wcfs mapping is updated to view the
         * database at a newer state.
         *
         * In general it is a bug to access data pages in between transactions,
         * so we accept this corner-case difference between wcfs and !wcfs.
         */
        if (flags & WRITEOUT_MARKSTORED) {
            page->state = PAGE_LOADED;
            list_del_init(&page->in_dirty);

            list_for_each(hmmap, &fileh->mmaps) {
                VMA *vma = list_entry(hmmap, typeof(*vma), same_fileh);
                if (fileh->mmap_overlay) {
                    /* wcfs:  RW -> base layer */
                    vma_page_ensure_unmapped(vma, page);
                } else {
                    /* !wcfs: RW -> R */
                    vma_page_ensure_notmappedrw(vma, page);
                }
            }

            /* wcfs:  all vmas are using base layer now - drop page completely
             *        without unnecessarily growing RSS and relying on reclaim.
             * !wcfs: keep the page in RAM cache, even if it is not mapped anywhere */
            if (fileh->mmap_overlay) {
                ASSERT(page->refcnt == 0);
                // XXX -> page_drop(page)
                pagemap_del(&fileh->pagemap, page->f_pgoffset);
                page_drop_memory(page);
                page_del(page);
            }
        }
    }


    /* if we successfully finished with markstored flag set - all dirty pages
     * should become non-dirty */
    if (flags & WRITEOUT_MARKSTORED)
        BUG_ON(!list_empty(&fileh->dirty_pages));

out:
    fileh->writeout_inprogress = 0;

    virt_unlock();
    sigsegv_restore(&save_sigset);
    return err;
}


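/* fileh_dirty_discard forgets all changes made to fileh memory: every dirty
 * page is dropped without being written back to the file. */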
void fileh_dirty_discard(BigFileH *fileh)
{
    Page *page;
    struct list_head *hpage, *hpage_next;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* discard is not allowed to run in parallel to writeout */
    BUG_ON(fileh->writeout_inprogress);

    list_for_each_safe(hpage, hpage_next, &fileh->dirty_pages) {
        page = list_entry(hpage, typeof(*page), in_dirty);
        BUG_ON(page->state != PAGE_DIRTY);

        // FIXME do pagemap_del + drop_ram + page_del unconditionally
        //       (just need think again and to update !wcfs discard test)
        if (fileh->mmap_overlay) {
            pagemap_del(&fileh->pagemap, page->f_pgoffset);
            page_drop_memory(page);
            page_del(page);
        } else {
            page_drop_memory(page);
        }
    }

    BUG_ON(!list_empty(&fileh->dirty_pages));

    virt_unlock();
    sigsegv_restore(&save_sigset);
}


/****************
 * INVALIDATION *
 ****************/

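/* fileh_invalidate_page invalidates the fileh page at pgoffset: its in-RAM
 * data is dropped (or marked to be dropped, if it is still being loaded), so
 * that the next access reloads the data from the file. */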
void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset)
{
    Page *page;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    /* it's an error to invalidate fileh while writeout is in progress */
    BUG_ON(fileh->writeout_inprogress);
    // XXX wcfs: invalidate_page must not be called (wcfs handles invalidations itself)
    // XXX       or allow invalidate anyway (e.g. DIRTY -> base) ?
    // XXX yes -> allow invalidate for wcfs too - means forget in-ram page and mmap to base memory

    page = pagemap_get(&fileh->pagemap, pgoffset);
    if (page) {
        /* for a page whose loading is in progress, we just remove the page from
         * the pagemap and mark it to be dropped by its loader after loading is
         * done. In the meantime, as the pagemap entry is now empty, on next
         * access to the memory the page will be created/loaded anew */
        if (page->state == PAGE_LOADING) {
            pagemap_del(&fileh->pagemap, pgoffset);
            page->state = PAGE_LOADING_INVALIDATED;
        }
        /* else we just make sure to drop page memory */
        else {
            page_drop_memory(page);
            // XXX + page_del       ?
            // XXX + pagemap_del    ?
        }
    }

    virt_unlock();
    sigsegv_restore(&save_sigset);
}


/************************
 *  Lookup VMA by addr  *
 ************************/

/* list of all registered VMA(s) */
static LIST_HEAD(vma_list);

/* protects ^^^  XXX */
//spinlock_t vma_list_lock;



/* lookup VMA covering `addr`. NULL if not found
 * (should be called with virtmem lock held) */
VMA *virt_lookup_vma(void *addr)
{
    uintptr_t uaddr = (uintptr_t)addr;
    struct list_head *h;
    VMA *vma;

    list_for_each(h, &vma_list) {
        // XXX -> list_for_each_entry
        vma = list_entry(h, typeof(*vma), virt_list);
        if (uaddr < vma->addr_stop)
            /*
             * here:  uaddr < vma->addr_stop, and this vma has the first such
             * addr_stop; whether vma->addr_start <= uaddr is checked below
             */
            return (vma->addr_start <= uaddr) ? vma : NULL;
    }

    return NULL;    /* not found at all or no overlap */
}


/* register VMA `vma` as covering some file view
 * (should be called with virtmem lock held) */
void virt_register_vma(VMA *vma)
{
    uintptr_t uaddr = vma->addr_start;
    struct list_head *h;
    struct VMA *a;

    list_for_each(h, &vma_list) {
        a = list_entry(h, typeof(*a), virt_list);
        if (uaddr < a->addr_stop)
            break;
    }

    /* either before found vma or, if not found, at the end of the list */
    list_add_tail(&vma->virt_list, h);
}


/* remove `vma` from the VMA registry. `vma` must have been registered before
 * (should be called with virtmem lock held) */
void virt_unregister_vma(VMA *vma)
{
    /* _init - to clear links, just in case */
    list_del_init(&vma->virt_list);
}


/*****************************************/

/*
 * allocate virtual memory address space
 * the pages are initially protected to prevent any access
 *
 * @addr    NULL - anywhere,        !NULL - exactly there
 * @return  !NULL - mapped there    NULL - error
 */
void *mem_valloc(void *addr, size_t len)
{
    void *a;
    a = mmap(addr, len, PROT_NONE,
            MAP_PRIVATE | MAP_ANONYMOUS
            /* don't try to (pre-)allocate memory - just virtual address space */
            | MAP_NORESERVE
            | (addr ? MAP_FIXED : 0),
            -1, 0);

    if (a == MAP_FAILED)
        a = NULL;

    if (a && addr)
        /* verify OS respected our MAP_FIXED request */
        BUG_ON(a != addr);

    return a;
}


/* like mem_valloc() but allocation must not fail */
void *mem_xvalloc(void *addr, size_t len)
{
    void *a;
    a = mem_valloc(addr, len);
    BUG_ON(!a);
    return a;
}


/*********************
 * PAGEFAULT HANDLER *
 *********************/

/* pagefault entry when we know request came to our memory area
 *
 * (virtmem_lock already taken by caller)   */
VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
{
    pgoff_t pagen;
    Page *page;
    BigFileH *fileh;
    struct list_head *hmmap;

    /* continuing on_pagefault() - see (1) there ... */

    /* (2) vma, addr -> fileh, pagen    ;idx of fileh page covering addr */
    fileh = vma->fileh;
    pagen = vma_addr_fpgoffset(vma, addr);

    /* wcfs: we should get into SIGSEGV handler only on write access */
    if (fileh->mmap_overlay)
        BUG_ON(!write);

    /* (3) fileh, pagen -> page  (via pagemap) */
    page = pagemap_get(&fileh->pagemap, pagen);

    /* wcfs: all dirty pages are mmapped when vma is created.
     *       thus here, if page is present in pagemap, it can be only either
     *       - a page we just loaded for dirtying, or
     *       - a page that is in progress of being loaded.
     *
     * ( PAGE_LOADED_FOR_WRITE is used only to verify that in wcfs mode we
     *   always keep all dirty pages mmapped on fileh_open and so pagefault
     *   handler must not see a PAGE_LOADED page. )
     */
    if (fileh->mmap_overlay && page)
        ASSERT(page->state == PAGE_LOADED_FOR_WRITE || page->state == PAGE_LOADING);

    /* (4) no page found - allocate new from ram */
    while (!page) {
        page = ramh_alloc_page(fileh->ramh, pagen);
        if (!page) {
            /* try to release some memory back to OS */
            // XXX do we need and how to distinguish "no ram page" vs "no memory for `struct page`"?
            //     -> no we don't -- better allocate memory for struct pages for whole RAM at ram setup
            if (!__ram_reclaim(fileh->ramh->ram))
                OOM();
            continue;
        }

        /* ramh set up .ramh, .ramh_pgoffset, .state?
         * now setup rest (link to fileh)  */
        page->fileh      = fileh;
        page->f_pgoffset = pagen;

        /* remember page in fileh->pagemap[pagen] */
        pagemap_set(&fileh->pagemap, pagen, page);
    }

    /* (5a) if page was not yet loaded - start loading it */
    if (page->state == PAGE_EMPTY) {
        /* NOTE if we load data in-place, there would be a race with concurrent
         * access to the page here - after first enabling memory-access to
         * the page, other threads could end up reading corrupt data, while
         * loading had not finished.
         *
         * so to avoid it we first load data to separate memory address, then
         * mmap-duplicate that page into here, but it is more work compared to
         * what kernel internally does.
         *
         * TODO try to use remap_anon_pages() when it is ready
         *      (but unfortunately it is only for anonymous memory)
         * NOTE remap_file_pages() is going away...
         */
        blk_t blk;
        void *pageram;
        int err;
        BigFile *file;

        /*
         * if pagesize < blksize - need to prepare several adjacent pages for blk;
         * if pagesize > blksize - will need to either 1) rescan which blk got
         *    dirty, or 2) store not-even-touched blocks adjacent to modified one.
         */
        file = fileh->file;
        TODO (file->blksize != page_size(page));

        // FIXME doing this mmap-to-temp/unmap is somewhat costly. Better
        // constantly have whole RAM mapping somewhere R/W and load there.
        // (XXX but then we'll either have
        //    - VMA fragmented  (if we manage whole RAM as 1 file of physram size),
        //    - or need to waste a lot of address space (size of each ramh can be very large)
        //
        //    generally this way it also has major problems)
        //
        // Also this way, we btw don't need to require python code to drop all
        // references to loading buf.

        /* mmap page memory temporarily somewhere
         * XXX better pre-map all ram pages r/w in another area to not need to mmap/unmap it here
         *     -> will run slightly faster (but major slowdown is in clear_page in kernel)
         */
        // TODO MAP_UNINITIALIZED somehow? (we'll overwrite that memory)
        pageram = page_mmap(page, NULL, PROT_READ | PROT_WRITE);
        TODO(!pageram); // XXX err

        /* load block -> pageram memory */
        blk = page->f_pgoffset;     // NOTE because blksize = pagesize

        /* mark page as loading and unlock virtmem before calling loadblk()
         *
         * that call is potentially slow and external code can take other
         * locks. If that "other locks" are also taken before external code
         * calls e.g. fileh_invalidate_page() in different codepath a deadlock
         * can happen. (similar to storeblk case) */
        page->state = PAGE_LOADING;
        virt_unlock();

        if (fileh->mmap_overlay) {
            /* wcfs: copy block data from read-only base mmap.
             * NOTE we'll get SIGBUS here if wcfs returns EIO when loading block data */
            memcpy(pageram, vma_page_addr(vma, page), page_size(page));
        }
        else {
            /* !wcfs: call loadblk */
            err = file->file_ops->loadblk(file, blk, pageram);

            /* TODO on error -> try to throw exception somehow to the caller, so
             *      that it can abort current transaction, but not die.
             *
             * NOTE for analogue situation when read for mmaped file fails, the
             *      kernel sends SIGBUS
             */
            TODO (err);
        }

        /* relock virtmem */
        virt_lock();

        xmunmap(pageram, page_size(page));

        /* if the page was invalidated while we were loading it, we have to drop
         * its memory and the Page structure completely - the invalidator has
         * already removed it from the pagemap */
        if (page->state == PAGE_LOADING_INVALIDATED) {
            page_drop_memory(page);
            page_del(page);
            // XXX + pagemap_del    ?
//            list_del(&page->lru);
//            bzero(page, sizeof(*page)); /* just in case */
//            free(page);
        }

        /* else just mark the page as loaded ok */
        else
            page->state = (write ? PAGE_LOADED_FOR_WRITE : PAGE_LOADED);

        /* we have to retry the whole fault, because the vma could have been
         * changed while we were loading page with virtmem lock released */
        return VM_RETRY;
    }

    /* (5b) page is currently being loaded by another thread - wait for load to complete
     *
     * NOTE a page is protected from being concurrently loaded by two threads at
     * the same time via:
     *
     *   - virtmem lock - we get/put pages from fileh->pagemap only under it
     *   - page->state is set to PAGE_LOADING for pages whose loading is in progress
     *   - such a page is inserted in fileh->pagemap
     *
     * so if a second thread faults at the same memory page, and the page is
     * still loading, it will find the page in PAGE_LOADING state and will just
     * wait for it to complete. */
    if (page->state == PAGE_LOADING) {
        /* XXX polling instead of proper completion */
        void *gilstate;
        virt_unlock();
        gilstate = virt_gil_ensure_unlocked();
        usleep(10000);  // XXX with 1000 usleep still busywaits
        virt_gil_retake_if_waslocked(gilstate);
        virt_lock();
        return VM_RETRY;
    }

    /* (6) page data ready. Mmap it atomically into vma address space, or mprotect
     * appropriately if it was already mmaped. */
    PageState newstate = PAGE_LOADED;   // XXX vvv PAGE_LOADED_FOR_WRITE ok?
    if (write || page->state == PAGE_DIRTY || page->state == PAGE_LOADED_FOR_WRITE) {
        newstate = PAGE_DIRTY;
    }

    // XXX also call page->markdirty() ?
    if (newstate == PAGE_DIRTY  &&  newstate != page->state) {
        /* it is not allowed to modify pages while writeout is in progress */
        BUG_ON(fileh->writeout_inprogress);

        list_add_tail(&page->in_dirty, &fileh->dirty_pages);
    }
    page->state = max(page->state, newstate);

    // XXX overlay: assert !vma->page_ismappedv[blk]    XXX not ok? (retrying after virt unlock/lock)
    // XXX mmap page to all vma with .mmap_overlay=1 of this fileh.

    vma_mmap_page(vma, page);
    /* wcfs: mmap the page into all wcfs-backed vmas of this fileh. If we
     * don't, reads through those vmas would see stale data */
    if (fileh->mmap_overlay) {
        list_for_each(hmmap, &fileh->mmaps) {
            VMA *vma2 = list_entry(hmmap, typeof(*vma2), same_fileh);
            if (vma2 != vma)
                vma_mmap_page(vma2, page);
        }
    }

    /* mark page as used recently */
    // XXX = list_move_tail()
    list_del(&page->lru);
    list_add_tail(&page->lru, &page->ramh->ram->lru_list);

    /*
     * (7) access to page prepared - now it is ok to return from signal handler
     *     - the caller will re-try executing faulting instruction.
     */
    return VM_HANDLED;
}


/***********
 * RECLAIM *
 ***********/

#define RECLAIM_BATCH   64      /* how many pages to reclaim at once */
static int __ram_reclaim(RAM *ram)
{
    struct list_head *lru_list = &ram->lru_list;
    struct list_head *hlru;
    Page *page;
    int batch = RECLAIM_BATCH, scanned = 0;

    TRACE("RAM_RECLAIM\n");
    hlru = lru_list->next;

    while (batch && hlru != lru_list) {
        page = list_entry(hlru, typeof(*page), lru);
        hlru = hlru->next;
        scanned++;

        /* can release ram only from loaded non-dirty pages
         * NOTE PAGE_LOADING pages are not dropped - they just continue to load */
        if (page->state == PAGE_LOADED || page->state == PAGE_LOADED_FOR_WRITE) {
            page_drop_memory(page);
            batch--;
        }

        /* PAGE_EMPTY pages without mappers go away */
        // XXX merge vvv with ^^^ : page_drop_memory + pagemap_del + page_del
        if (page->state == PAGE_EMPTY) {
            BUG_ON(page->refcnt != 0);  // XXX what for then we have refcnt? -> vs discard

            /* delete page & its entry in fileh->pagemap */
            pagemap_del(&page->fileh->pagemap, page->f_pgoffset);
            page_del(page);
//            list_del(&page->lru);
//            bzero(page, sizeof(*page)); /* just in case */
//            free(page);
        }

    }

    TRACE("\t-> reclaimed %i  scanned %i\n", RECLAIM_BATCH - batch, scanned);
    return RECLAIM_BATCH - batch;
}


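/* ram_reclaim releases some RAM back to the OS by dropping memory of clean,
 * fully-loaded pages; returns the number of pages reclaimed. */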
int ram_reclaim(RAM *ram)
{
    int ret;
    sigset_t save_sigset;

    sigsegv_block(&save_sigset);
    virt_lock();

    ret = __ram_reclaim(ram);

    virt_unlock();
    sigsegv_restore(&save_sigset);
    return ret;
}


/********************
 * Internal helpers *
 ********************/

static size_t page_size(const Page *page)
{
    return page->ramh->ram->pagesize;
}

void page_incref(Page *page)
{
    page->refcnt++;     // XXX atomically ?
}

void page_decref(Page *page)
{
    page->refcnt--;     // XXX atomically ?
    BUG_ON(page->refcnt < 0);

    // TODO if unused delete self && clear pagemap ?
    // XXX  if dirty -> delete = not ok
}

void *page_mmap(Page *page, void *addr, int prot)
{
    RAMH *ramh = page->ramh;
    // XXX better call ramh_mmap_page() without tinkering with ramh_ops?
    return ramh->ramh_ops->mmap_page(ramh, page->ramh_pgoffset, addr, prot);
}


// XXX -> page_drop = drop memory, delete page from pagemap, delete page
static void page_drop_memory(Page *page)
{
    /* Memory for this page goes out. 1) unmap it from all mmaps */
    struct list_head *hmmap;

    /* NOTE we try not to drop memory for loading-in-progress pages.
     *      so if this is called for such a page - it is a bug. */
    BUG_ON(page->state == PAGE_LOADING);
    /* same for storing-in-progress */
    BUG_ON(page->fileh->writeout_inprogress && page->state == PAGE_DIRTY);

    if (page->state == PAGE_EMPTY)
        return;

    list_for_each(hmmap, &page->fileh->mmaps) {
        VMA *vma = list_entry(hmmap, typeof(*vma), same_fileh);
        vma_page_ensure_unmapped(vma, page);
    }

    /* 2) release memory to ram */
    ramh_drop_memory(page->ramh, page->ramh_pgoffset);
    if (page->state == PAGE_DIRTY)
        list_del_init(&page->in_dirty);
    page->state = PAGE_EMPTY;

    // XXX touch lru?
}

/* page_del deletes Page struct (but not page memory - see page_drop_memory).
 *
 * The page is removed from ram->lru.
 */
static void page_del(Page *page) {
    BUG_ON(page->refcnt != 0);
    BUG_ON(page->state == PAGE_DIRTY);  // XXX + PAGE_LOADING ?

    list_del(&page->lru);
    bzero(page, sizeof(*page)); /* just in case */
    free(page);
}



/* vma: page -> addr  where it should-be mmaped in vma */
static void *vma_page_addr(VMA *vma, Page *page)
{
    uintptr_t addr;
    ASSERT(vma->fileh == page->fileh);      // XXX needed here?

    addr = vma->addr_start + (page->f_pgoffset - vma->f_pgoffset) * page_size(page);
    ASSERT(vma->addr_start <= addr  &&
                              addr < vma->addr_stop);
    return (void *)addr;
}


/* vma: addr -> fileh pgoffset  with page containing addr */
static pgoff_t vma_addr_fpgoffset(VMA *vma, uintptr_t addr)
{
    return vma->f_pgoffset + (addr - vma->addr_start) / vma->fileh->ramh->ram->pagesize;
}


/* vma_mmap_page mmaps page into vma.
 *
 * the page must belong to covered file.
 * mmap protection is PROT_READ if page is PAGE_LOADED or PROT_READ|PROT_WRITE
 * if page is PAGE_DIRTY.
 *
 * must be called under virtmem lock.
 */
static void vma_mmap_page(VMA *vma, Page *page) {
    pgoff_t pgoff_invma;
    int prot = (page->state == PAGE_DIRTY ? PROT_READ|PROT_WRITE : PROT_READ);

    ASSERT(page->state == PAGE_LOADED || page->state == PAGE_DIRTY);
    ASSERT(vma->f_pgoffset <= page->f_pgoffset &&
                              page->f_pgoffset < vma_addr_fpgoffset(vma, vma->addr_stop));

    pgoff_invma = page->f_pgoffset - vma->f_pgoffset;
    if (!bitmap_test_bit(vma->page_ismappedv, pgoff_invma)) {
        // XXX err
        page_mmap(page, vma_page_addr(vma, page), prot);
        bitmap_set_bit(vma->page_ismappedv, pgoff_invma);
        page_incref(page);
    }
    else {
        /* just changing protection bits should not fail, if parameters ok */
        xmprotect(vma_page_addr(vma, page), page_size(page), prot);
    }
}

/* is `page` mapped to `vma` */
static int vma_page_ismapped(VMA *vma, Page *page)
{
    pgoff_t vma_fpgstop;
    ASSERT(vma->fileh == page->fileh);

    vma_fpgstop = vma_addr_fpgoffset(vma, vma->addr_stop);
    if (!(vma->f_pgoffset <= page->f_pgoffset  &&
                             page->f_pgoffset < vma_fpgstop))
        return 0;

    return bitmap_test_bit(vma->page_ismappedv, page->f_pgoffset - vma->f_pgoffset);
}


/* ensure `page` is not mapped to `vma` */
static void vma_page_ensure_unmapped(VMA *vma, Page *page)
{
    if (!vma_page_ismapped(vma, page))
        return;

    if (vma->fileh->mmap_overlay) {
        /* wcfs: remmap readonly to base image */
        BigFile *file = vma->fileh->file;
        int err;

        TODO (file->blksize != page_size(page));
        err = file->file_ops->remmap_blk_read(file, /* blk = */page->f_pgoffset, vma);
        BUG_ON(err); /* must not fail */
    }
    else {
        /* !wcfs: mmap empty PROT_NONE address space instead of page memory */
        mem_xvalloc(vma_page_addr(vma, page), page_size(page));
    }

    bitmap_clear_bit(vma->page_ismappedv, page->f_pgoffset - vma->f_pgoffset);
    page_decref(page);
}


/* ensure `page` is not mapped RW to `vma`
 *
 * if mapped -> should be mapped as R
 * if not mapped - leave as is
 */
static void vma_page_ensure_notmappedrw(VMA *vma, Page *page)
{
    if (!vma_page_ismapped(vma, page))
        return;

    /* just changing protection - should not fail */
    // XXX PROT_READ always? (it could be mmaped with PROT_NONE before without
    // first access) - then it should not be mapped in page_ismappedv -> ok.
    xmprotect(vma_page_addr(vma, page), page_size(page), PROT_READ);
}


// XXX stub
void OOM(void)
{
    BUG();
}