// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->block_dirty_folio)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock		(filemap_fault)
 *      ->lock_page		(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
 *    ->private_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock		(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->memcg->move_lock	(folio_remove_rmap_pte->folio_memcg_lock)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->block_dirty_folio)
 */
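
/*
 * Illustrative sketch (not part of this file): how a filesystem's truncate
 * path might honour the ordering above, assuming a hypothetical
 * example_setsize() helper.  ->i_rwsem is taken first, then
 * ->invalidate_lock; truncate_setsize() ends up in truncate_pagecache(),
 * which takes ->i_mmap_rwsem internally.
 *
 *	static void example_setsize(struct inode *inode, loff_t newsize)
 *	{
 *		inode_lock(inode);				// ->i_rwsem
 *		filemap_invalidate_lock(inode->i_mapping);	// ->invalidate_lock
 *		truncate_setsize(inode, newsize);		// ->i_mmap_rwsem inside
 *		filemap_invalidate_unlock(inode->i_mapping);
 *		inode_unlock(inode);
 *	}
 */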

static void mapping_set_update(struct xa_state *xas,
		struct address_space *mapping)
{
	if (dax_mapping(mapping) || shmem_mapping(mapping))
		return;
	xas_set_update(xas, workingset_update_node);
	xas_set_lru(xas, &shadow_nodes);
}

static void page_cache_delete(struct address_space *mapping,
				   struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	xas_set_order(&xas, folio->index, folio_order(folio));
	nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = folio_mapcount(folio);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
				 */
				atomic_set(&folio->_mapcount, -1);
				folio_ref_sub(folio, mapcount);
			}
		}
	}

	/* hugetlb folios do not participate in page cache accounting. */
	if (folio_test_hugetlb(folio))
		return;

	nr = folio_nr_pages(folio);

	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	if (folio_test_swapbacked(folio)) {
		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		if (folio_test_pmd_mappable(folio))
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
	} else if (folio_test_pmd_mappable(folio)) {
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}

	/*
	 * At this point folio must be either written or cleaned by
	 * truncate.  Dirty folio here signals a bug and loss of
	 * unwritten data - on ordinary filesystems.
	 *
	 * But it's harmless on in-memory filesystems like tmpfs; and can
	 * occur when a driver which did get_user_pages() sets page dirty
	 * before putting it, while the inode is being finally evicted.
	 *
	 * Below fixes dirty accounting after removing the folio entirely
	 * but leaves the dirty flag set: it has no effect for truncated
	 * folio and anyway will be cleared before returning folio to
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
			 mapping_can_writeback(mapping)))
		folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	struct address_space *mapping = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);
	filemap_unaccount_folio(mapping, folio);
	page_cache_delete(mapping, folio, shadow);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*free_folio)(struct folio *);
	int refs = 1;

	free_folio = mapping->a_ops->free_folio;
	if (free_folio)
		free_folio(folio);

	if (folio_test_large(folio))
		refs = folio_nr_pages(folio);
	folio_put_refs(folio, refs);
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping. The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
	long total_pages = 0;
	int i = 0;
	struct folio *folio;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (i >= folio_batch_count(fbatch))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(folio))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (folio != fbatch->folios[i]) {
			VM_BUG_ON_FOLIO(folio->index >
					fbatch->folios[i]->index, folio);
			continue;
		}

		WARN_ON_ONCE(!folio_test_locked(folio));

		folio->mapping = NULL;
		/* Leave folio->index set: truncation lookup relies on it */

		i++;
		xas_store(&xas, NULL);
		total_pages += folio_nr_pages(folio);
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	int i;

	if (!folio_batch_count(fbatch))
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		trace_mm_filemap_delete_from_page_cache(folio);
		filemap_unaccount_folio(mapping, folio);
	}
	page_cache_delete_batch(mapping, fbatch);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (i = 0; i < folio_batch_count(fbatch); i++)
		filemap_free_folio(mapping, fbatch->folios[i]);
}

int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @wbc:	the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	ret = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);
	return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return filemap_fdatawrite_wbc(mapping, &wbc);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct folio *folio;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		folio = xas_find(&xas, max);
		if (xas_retry(&xas, folio))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(folio))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
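
/*
 * Illustrative sketch (not part of this file): a direct-I/O write path
 * might use filemap_range_has_page() to decide whether cached pages need
 * to be written back before issuing the I/O.
 *
 *	if (filemap_range_has_page(mapping, pos, pos + count - 1)) {
 *		err = filemap_write_and_wait_range(mapping, pos,
 *						   pos + count - 1);
 *		if (err)
 *			return err;
 *	}
 */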

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned nr_folios;

	folio_batch_init(&fbatch);

	while (index <= end) {
		unsigned i;

		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				PAGECACHE_TAG_WRITEBACK, &fbatch);

		if (!nr_folios)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_wait_writeback(folio);
			folio_clear_error(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
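
/*
 * Illustrative sketch (not part of this file): flushing an entire file
 * before an operation that must observe stable data on disk.  Passing
 * lstart=0 and lend=LLONG_MAX covers the whole mapping.
 *
 *	err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
 *	if (err)
 *		return err;
 */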

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   reported and advance wb_err to the current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
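
/*
 * Illustrative sketch (not part of this file): how a writeback error
 * recorded with mapping_set_error() is later consumed.  A writepages
 * failure bumps mapping->wb_err; each fsync caller then sees the error
 * exactly once via its own file->f_wb_err cursor.
 *
 *	// in the filesystem's writeback failure path:
 *	mapping_set_error(mapping, -EIO);
 *
 *	// in the filesystem's ->fsync, after waiting for writeback:
 *	err = file_check_and_advance_wb_err(file);
 */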

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);
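
/*
 * Illustrative sketch (not part of this file): a minimal ->fsync for a
 * hypothetical filesystem with no metadata of its own to flush would boil
 * down to writing and waiting on the range and reporting any writeback
 * error recorded against this struct file.
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		return file_write_and_wait_range(file, start, end);
 *	}
 */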

/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_replace_folio(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		__lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		__lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *alloced_shadow = NULL;
	int alloced_order = 0;
	bool huge;
	long nr;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
			folio);
	mapping_set_update(&xas, mapping);

	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
	xas_set_order(&xas, index, folio_order(folio));
	huge = folio_test_hugetlb(folio);
	nr = folio_nr_pages(folio);

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	for (;;) {
		int order = -1, split_order = 0;
		void *entry, *old = NULL;

		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/*
			 * If a larger entry exists,
			 * it will be the first and only entry iterated.
			 */
			if (order == -1)
				order = xas_get_order(&xas);
		}

		/* entry may have changed before we re-acquire the lock */
		if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
			xas_destroy(&xas);
			alloced_order = 0;
		}

		if (old) {
			if (order > 0 && order > folio_order(folio)) {
				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));
				if (!alloced_order) {
					split_order = order;
					goto unlock;
				}
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
			if (shadowp)
				*shadowp = old;
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				__lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}

unlock:
		xas_unlock_irq(&xas);

		/* split needed, alloc here and retry. */
		if (split_order) {
			xas_split_alloc(&xas, old, split_order, gfp);
			if (xas_error(&xas))
				goto error;
			alloced_shadow = old;
			alloced_order = split_order;
			xas_reset(&xas);
			continue;
		}

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	folio->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int ret;

	ret = mem_cgroup_charge(folio, NULL, gfp);
	if (ret)
		return ret;

	__folio_set_locked(folio);
	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(ret)) {
		mem_cgroup_uncharge(folio);
		__folio_clear_locked(folio);
	} else {
		/*
		 * The folio might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed folio.
		 * The exception is folios getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(folio_test_active(folio));
		if (!(gfp & __GFP_WRITE) && shadow)
			workingset_refault(folio, shadow);
		folio_add_lru(folio);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);
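
/*
 * Illustrative sketch (not part of this file): the usual pattern for
 * populating the page cache, along the lines of the readahead and
 * buffered-read paths - allocate a folio, try to insert it at index, and
 * drop it again if someone else got there first (-EEXIST).
 *
 *	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
 *	if (!folio)
 *		return -ENOMEM;
 *	err = filemap_add_folio(mapping, folio, index,
 *				mapping_gfp_mask(mapping));
 *	if (err) {
 *		folio_put(folio);
 *		if (err != -EEXIST)
 *			return err;
 *	}
 */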

#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
	int n;
	struct folio *folio;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			folio = __folio_alloc_node_noprof(gfp, order, n);
		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));

		return folio;
	}
	return folio_alloc_noprof(gfp, order);
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
1105
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->folio->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key(q, TASK_NORMAL, &key);

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags))
			return false;
	} else if (test_bit(bit_nr, &folio->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;

static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
1359
	 */
1360 1361
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
1362

1363
	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
1364 1365
}

1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377
#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = pfn_swap_entry_folio(entry);

	q = folio_waitqueue(folio);
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
 * @folio: Folio defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @folio.
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	folio_set_waiters(folio);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);
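
/*
 * Illustrative sketch (not part of this file): the common
 * lock/inspect/unlock pattern around a page-cache folio.  Note that
 * filemap_lock_folio() returns an ERR_PTR() when no folio is present.
 *
 *	folio = filemap_lock_folio(mapping, index);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	// ... operate on the locked folio ...
 *	folio_unlock(folio);
 *	folio_put(folio);
 */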

/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * When all reads against a folio have completed, filesystems should
 * call this function to let the pagecache know that no more reads
 * are outstanding.  This will unlock the folio and wake up any thread
 * sleeping on the lock.  The folio will also be marked uptodate if all
 * reads succeeded.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long mask = 1 << PG_locked;

	/* Must be in bottom byte for x86 to work */
	BUILD_BUG_ON(PG_uptodate > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

	if (likely(success))
		mask |= 1 << PG_uptodate;
	if (folio_xor_flags_has_waiters(folio, mask))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);
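
/*
 * Illustrative sketch (not part of this file): a read completion handler
 * in a bio-based filesystem might end the read on each folio like this,
 * marking it uptodate only if the I/O succeeded (assuming each folio is
 * read by a single bio).
 *
 *	static void example_read_end_io(struct bio *bio)
 *	{
 *		struct folio_iter fi;
 *
 *		bio_for_each_folio_all(fi, bio)
 *			folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
 *		bio_put(bio);
 *	}
 */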

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
	folio_wake_bit(folio, PG_private_2);
	folio_put(folio);
1562
}
1563
EXPORT_SYMBOL(folio_end_private_2);
1564 1565

/**
1566 1567
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
1568
 *
1569
 * Wait for PG_private_2 to be cleared on a folio.
1570
 */
1571
void folio_wait_private_2(struct folio *folio)
1572
{
1573 1574
	while (folio_test_private_2(folio))
		folio_wait_bit(folio, PG_private_2);
1575
}
1576
EXPORT_SYMBOL(folio_wait_private_2);
1577 1578

/**
1579 1580
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
1581
 *
1582 1583
 * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
 * received by the calling task.
1584 1585 1586 1587 1588
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
1589
int folio_wait_private_2_killable(struct folio *folio)
1590 1591 1592
{
	int ret = 0;

1593 1594
	while (folio_test_private_2(folio)) {
		ret = folio_wait_bit_killable(folio, PG_private_2);
1595 1596 1597 1598 1599 1600
		if (ret < 0)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * folio_test_clear_reclaim() could be used here but it is an
	 * atomic operation and overkill in this particular case. Failing
	 * to shuffle a folio marked for immediate reclaim is too mild
	 * a gain to justify taking an atomic operation penalty at the
	 * end of every folio writeback.
	 */
	if (folio_test_reclaim(folio)) {
		folio_clear_reclaim(folio);
		folio_rotate_reclaimable(folio);
	}

	/*
	 * Writeback does not hold a folio reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the folio is not freed and
	 * reused before the folio_wake_bit().
	 */
	folio_get(folio);
	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);
	acct_reclaim_writeback(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
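
/*
 * Illustrative sketch (not from the kernel tree): a writeback completion
 * handler reporting that the I/O against @folio has finished.  The helper
 * example_writeback_done() is hypothetical.
 */
static void __maybe_unused example_writeback_done(struct folio *folio, int err)
{
	if (err)
		mapping_set_error(folio->mapping, err);
	folio_end_writeback(folio);
}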

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

int __folio_lock_killable(struct folio *folio)
{
	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = folio_waitqueue(folio);
	int ret;

	wait->folio = folio;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	folio_set_waiters(folio);
	ret = !folio_trylock(folio);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	unsigned int flags = vmf->flags;

	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
		 * released even though returning VM_FAULT_RETRY.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;

		release_fault_lock(vmf);
		if (flags & FAULT_FLAG_KILLABLE)
			folio_wait_locked_killable(folio);
		else
			folio_wait_locked(folio);
		return VM_FAULT_RETRY;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		bool ret;

		ret = __folio_lock_killable(folio);
		if (ret) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
	} else {
		__folio_lock(folio);
	}

	return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			return xas.xa_index;
		if (xas.xa_index == 0)
			return 0;
	}

	return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);
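
/*
 * Illustrative sketch (not from the kernel tree): using page_cache_next_miss()
 * to check whether every index in [index, index + nr) currently has an entry
 * in the cache.  example_range_populated() is hypothetical and ignores the
 * rare wrap-around case documented above.
 */
static bool __maybe_unused example_range_populated(struct address_space *mapping,
		pgoff_t index, unsigned long nr)
{
	return page_cache_next_miss(mapping, index, nr) - index >= nr;
}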

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or GUP-fast) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
 * it is returned with an increased refcount.  If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		goto out;

	if (!folio_try_get(folio))
		goto repeat;

	if (unlikely(folio != xas_reload(&xas))) {
		folio_put(folio);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return folio;
}

/**
 * __filemap_get_folio - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
		fgf_t fgp_flags, gfp_t gfp)
{
	struct folio *folio;

repeat:
	folio = filemap_get_entry(mapping, index);
	if (xa_is_value(folio))
		folio = NULL;
	if (!folio)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!folio_trylock(folio)) {
				folio_put(folio);
				return ERR_PTR(-EAGAIN);
			}
		} else {
			folio_lock(folio);
		}

		/* Has the page been truncated? */
		if (unlikely(folio->mapping != mapping)) {
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
		VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
	}

	if (fgp_flags & FGP_ACCESSED)
		folio_mark_accessed(folio);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (folio_test_idle(folio))
			folio_clear_idle(folio);
	}

	if (fgp_flags & FGP_STABLE)
		folio_wait_stable(folio);
no_page:
	if (!folio && (fgp_flags & FGP_CREAT)) {
		unsigned int min_order = mapping_min_folio_order(mapping);
		unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
		int err;
		index = mapping_align_index(mapping, index);

		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp &= ~__GFP_FS;
		if (fgp_flags & FGP_NOWAIT) {
			gfp &= ~GFP_KERNEL;
			gfp |= GFP_NOWAIT | __GFP_NOWARN;
		}
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		if (order > mapping_max_folio_order(mapping))
			order = mapping_max_folio_order(mapping);
		/* If we're not aligned, allocate a smaller folio */
		if (index & ((1UL << order) - 1))
			order = __ffs(index);

		do {
			gfp_t alloc_gfp = gfp;

			err = -ENOMEM;
			if (order > min_order)
				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
			folio = filemap_alloc_folio(alloc_gfp, order);
			if (!folio)
				continue;

			/* Init accessed so avoid atomic mark_page_accessed later */
			if (fgp_flags & FGP_ACCESSED)
				__folio_set_referenced(folio);

			err = filemap_add_folio(mapping, folio, index, gfp);
			if (!err)
				break;
			folio_put(folio);
			folio = NULL;
		} while (order-- > min_order);

		if (err == -EEXIST)
			goto repeat;
		if (err)
			return ERR_PTR(err);
		/*
		 * filemap_add_folio locks the page, and for mmap
		 * we expect an unlocked page.
		 */
		if (folio && (fgp_flags & FGP_FOR_MMAP))
			folio_unlock(folio);
	}

	if (!folio)
		return ERR_PTR(-ENOENT);
	return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
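
/*
 * Illustrative sketch (not from the kernel tree): find-or-create a locked
 * folio for a buffered write, in the style of a simple ->write_begin()
 * implementation.  example_grab_folio() is hypothetical.
 */
static __maybe_unused struct folio *example_grab_folio(
		struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index,
			FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE,
			mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return NULL;
	/* The caller must folio_unlock() and folio_put() when done. */
	return folio;
}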

static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
		xa_mark_t mark)
{
	struct folio *folio;

retry:
	if (mark == XA_PRESENT)
		folio = xas_find(xas, max);
	else
		folio = xas_find_marked(xas, max, mark);

	if (xas_retry(xas, folio))
		goto retry;
	/*
	 * A shadow entry of a recently evicted page, a swap
	 * entry from shmem/tmpfs or a DAX entry.  Return it
	 * without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		return folio;

	if (!folio_try_get(folio))
		goto reset;

	if (unlikely(folio != xas_reload(xas))) {
		folio_put(folio);
		goto reset;
	}

	return folio;
reset:
	xas_reset(xas);
	goto retry;
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @end:	The final page index (inclusive).
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: The number of entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
	}
	rcu_read_unlock();

	if (folio_batch_count(fbatch)) {
		unsigned long nr = 1;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
		if (!xa_is_value(folio))
			nr = folio_nr_pages(folio);
		*start = indices[idx] + nr;
	}
	return folio_batch_count(fbatch);
}

/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:	The address_space to search.
 * @start:	The starting page cache index.
 * @end:	The final page index (inclusive).
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices of the entries in @fbatch.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included.  Folios are returned
 * locked and with an incremented refcount.  Folios which are locked
 * by somebody else or under writeback are skipped.  Folios which are
 * partially outside the range are not returned.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries, large folios, folios which could not be
 * locked or folios under writeback.
 *
 * Return: The number of entries which were found.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
		if (!xa_is_value(folio)) {
			if (folio->index < *start)
				goto put;
			if (folio_next_index(folio) - 1 > end)
				goto put;
			if (!folio_trylock(folio))
				goto put;
			if (folio->mapping != mapping ||
			    folio_test_writeback(folio))
				goto unlock;
			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
					folio);
		}
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
		continue;
unlock:
		folio_unlock(folio);
put:
		folio_put(folio);
	}
	rcu_read_unlock();

	if (folio_batch_count(fbatch)) {
		unsigned long nr = 1;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
		if (!xa_is_value(folio))
			nr = folio_nr_pages(folio);
		*start = indices[idx] + nr;
	}
	return folio_batch_count(fbatch);
}

/**
 * filemap_get_folios - Get a batch of folios
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @fbatch:	The batch to fill.
 *
 * Search for and return a batch of folios in the mapping starting at
 * index @start and up to index @end (inclusive).  The folios are returned
 * in @fbatch with an elevated reference count.
 *
 * Return: The number of folios which were found.
 * We also update @start to index the next folio for the traversal.
 */
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch)
{
	return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
}
EXPORT_SYMBOL(filemap_get_folios);
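
/*
 * Illustrative sketch (not from the kernel tree): walking every folio in a
 * range with filemap_get_folios(), here simply marking each one accessed.
 * example_touch_range() is hypothetical.
 */
static void __maybe_unused example_touch_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &start, end, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_mark_accessed(fbatch.folios[i]);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}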

/**
 * filemap_get_folios_contig - Get a batch of contiguous folios
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @fbatch:	The batch to fill
 *
 * filemap_get_folios_contig() works exactly like filemap_get_folios(),
 * except the returned folios are guaranteed to be contiguous. This may
 * not return all contiguous folios if the batch gets filled up.
 *
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
 */

unsigned filemap_get_folios_contig(struct address_space *mapping,
		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	unsigned long nr;
	struct folio *folio;

	rcu_read_lock();

	for (folio = xas_load(&xas); folio && xas.xa_index <= end;
			folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;
		/*
		 * If the entry has been swapped out, we can stop looking.
		 * No current caller is looking for DAX entries.
		 */
		if (xa_is_value(folio))
			goto update_start;

		if (!folio_try_get(folio))
			goto retry;

		if (unlikely(folio != xas_reload(&xas)))
			goto put_folio;

		if (!folio_batch_add(fbatch, folio)) {
			nr = folio_nr_pages(folio);
			*start = folio->index + nr;
			goto out;
		}
		continue;
put_folio:
		folio_put(folio);

retry:
		xas_reset(&xas);
	}

update_start:
	nr = folio_batch_count(fbatch);

	if (nr) {
		folio = fbatch->folios[nr - 1];
		*start = folio_next_index(folio);
	}
out:
	rcu_read_unlock();
	return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_contig);

/**
 * filemap_get_folios_tag - Get a batch of folios matching @tag
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @end:        The final page index (inclusive)
 * @tag:        The tag index
 * @fbatch:     The batch to fill
 *
 * The first folio may start before @start; if it does, it will contain
 * @start.  The final folio may extend beyond @end; if it does, it will
 * contain @end.  The folios have ascending indices.  There may be gaps
 * between the folios if there are indices which have no folio in the
 * page cache.  If folios are added to or removed from the page cache
 * while this is running, they may or may not be found by this call.
 * Only returns folios that are tagged with @tag.
 *
 * Return: The number of folios found.
 * Also update @start to index the next folio for traversal.
 */
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
			pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
		/*
		 * Shadow entries should never be tagged, but this iteration
		 * is lockless so there is a window for page reclaim to evict
		 * a page we saw tagged. Skip over it.
		 */
		if (xa_is_value(folio))
			continue;
		if (!folio_batch_add(fbatch, folio)) {
			unsigned long nr = folio_nr_pages(folio);
			*start = folio->index + nr;
			goto out;
		}
	}
	/*
	 * We come here when there is no page beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is a page at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
static void filemap_get_read_batch(struct address_space *mapping,
		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;
		if (xas.xa_index > max || xa_is_value(folio))
			break;
		if (xa_is_sibling(folio))
			break;
		if (!folio_try_get(folio))
			goto retry;

		if (unlikely(folio != xas_reload(&xas)))
			goto put_folio;

		if (!folio_batch_add(fbatch, folio))
			break;
		if (!folio_test_uptodate(folio))
			break;
		if (folio_test_readahead(folio))
			break;
		xas_advance(&xas, folio_next_index(folio) - 1);
		continue;
put_folio:
		folio_put(folio);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}

static int filemap_read_folio(struct file *file, filler_t filler,
		struct folio *folio)
{
	bool workingset = folio_test_workingset(folio);
	unsigned long pflags;
	int error;

	/*
	 * A previous I/O error may have been due to temporary failures,
	 * eg. multipath errors.  PG_error will be set again if read_folio
	 * fails.
	 */
	folio_clear_error(folio);

	/* Start the actual read. The read will unlock the page. */
	if (unlikely(workingset))
		psi_memstall_enter(&pflags);
	error = filler(file, folio);
	if (unlikely(workingset))
		psi_memstall_leave(&pflags);
	if (error)
		return error;

	error = folio_wait_locked_killable(folio);
	if (error)
		return error;
	if (folio_test_uptodate(folio))
		return 0;
	if (file)
		shrink_readahead_size_eio(&file->f_ra);
	return -EIO;
}

static bool filemap_range_uptodate(struct address_space *mapping,
		loff_t pos, size_t count, struct folio *folio,
		bool need_uptodate)
{
	if (folio_test_uptodate(folio))
		return true;
	/* pipes can't handle partially uptodate pages */
	if (need_uptodate)
		return false;
	if (!mapping->a_ops->is_partially_uptodate)
		return false;
	if (mapping->host->i_blkbits >= folio_shift(folio))
		return false;

	if (folio_pos(folio) > pos) {
		count -= folio_pos(folio) - pos;
		pos = 0;
	} else {
		pos -= folio_pos(folio);
	}

	return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}

static int filemap_update_page(struct kiocb *iocb,
		struct address_space *mapping, size_t count,
		struct folio *folio, bool need_uptodate)
{
	int error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!filemap_invalidate_trylock_shared(mapping))
			return -EAGAIN;
	} else {
		filemap_invalidate_lock_shared(mapping);
	}

	if (!folio_trylock(folio)) {
		error = -EAGAIN;
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
			goto unlock_mapping;
		if (!(iocb->ki_flags & IOCB_WAITQ)) {
			filemap_invalidate_unlock_shared(mapping);
			/*
			 * This is where we usually end up waiting for a
			 * previously submitted readahead to finish.
			 */
			folio_put_wait_locked(folio, TASK_KILLABLE);
			return AOP_TRUNCATED_PAGE;
		}
		error = __folio_lock_async(folio, iocb->ki_waitq);
		if (error)
			goto unlock_mapping;
	}

	error = AOP_TRUNCATED_PAGE;
	if (!folio->mapping)
		goto unlock;

	error = 0;
	if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
				   need_uptodate))
		goto unlock;

	error = -EAGAIN;
	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
		goto unlock;

	error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
			folio);
	goto unlock_mapping;
unlock:
	folio_unlock(folio);
unlock_mapping:
	filemap_invalidate_unlock_shared(mapping);
	if (error == AOP_TRUNCATED_PAGE)
		folio_put(folio);
	return error;
}

static int filemap_create_folio(struct file *file,
		struct address_space *mapping, loff_t pos,
		struct folio_batch *fbatch)
{
	struct folio *folio;
	int error;
	unsigned int min_order = mapping_min_folio_order(mapping);
	pgoff_t index;

	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
	if (!folio)
		return -ENOMEM;

	/*
	 * Protect against truncate / hole punch. Grabbing invalidate_lock
	 * here assures we cannot instantiate and bring uptodate new
	 * pagecache folios after evicting page cache during truncate
	 * and before actually freeing blocks.	Note that we could
	 * release invalidate_lock after inserting the folio into
	 * the page cache as the locked folio would then be enough to
	 * synchronize with hole punching. But there are code paths
	 * such as filemap_update_page() filling in partially uptodate
	 * pages or ->readahead() that need to hold invalidate_lock
	 * while mapping blocks for IO so let's hold the lock here as
	 * well to keep locking rules simple.
	 */
	filemap_invalidate_lock_shared(mapping);
	index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
	error = filemap_add_folio(mapping, folio, index,
			mapping_gfp_constraint(mapping, GFP_KERNEL));
	if (error == -EEXIST)
		error = AOP_TRUNCATED_PAGE;
	if (error)
		goto error;

	error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
	if (error)
		goto error;

	filemap_invalidate_unlock_shared(mapping);
	folio_batch_add(fbatch, folio);
	return 0;
error:
	filemap_invalidate_unlock_shared(mapping);
	folio_put(folio);
	return error;
}

static int filemap_readahead(struct kiocb *iocb, struct file *file,
		struct address_space *mapping, struct folio *folio,
		pgoff_t last_index)
{
	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);

	if (iocb->ki_flags & IOCB_NOIO)
		return -EAGAIN;
	page_cache_async_ra(&ractl, folio, last_index - folio->index);
	return 0;
}

static int filemap_get_pages(struct kiocb *iocb, size_t count,
		struct folio_batch *fbatch, bool need_uptodate)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
	pgoff_t last_index;
	struct folio *folio;
	int err = 0;

	/* "last_index" is the index of the page beyond the end of the read */
	last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
	if (fatal_signal_pending(current))
		return -EINTR;

	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
	if (!folio_batch_count(fbatch)) {
		if (iocb->ki_flags & IOCB_NOIO)
			return -EAGAIN;
		page_cache_sync_readahead(mapping, ra, filp, index,
				last_index - index);
		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
	}
	if (!folio_batch_count(fbatch)) {
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
			return -EAGAIN;
		err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
		if (err == AOP_TRUNCATED_PAGE)
			goto retry;
		return err;
	}

	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
	if (folio_test_readahead(folio)) {
		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
		if (err)
			goto err;
	}
	if (!folio_test_uptodate(folio)) {
		if ((iocb->ki_flags & IOCB_WAITQ) &&
		    folio_batch_count(fbatch) > 1)
			iocb->ki_flags |= IOCB_NOWAIT;
		err = filemap_update_page(iocb, mapping, count, folio,
					  need_uptodate);
		if (err)
			goto err;
	}

	return 0;
err:
	if (err < 0)
		folio_put(folio);
	if (likely(--fbatch->nr))
		return 0;
	if (err == AOP_TRUNCATED_PAGE)
		goto retry;
	return err;
}

static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
{
	unsigned int shift = folio_shift(folio);

	return (pos1 >> shift == pos2 >> shift);
}

/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t already_read)
{
	struct file *filp = iocb->ki_filp;
	struct file_ra_state *ra = &filp->f_ra;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct folio_batch fbatch;
	int i, error = 0;
	bool writably_mapped;
	loff_t isize, end_offset;
	loff_t last_pos = ra->prev_pos;

	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
		return 0;
	if (unlikely(!iov_iter_count(iter)))
		return 0;

	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
	folio_batch_init(&fbatch);

	do {
		cond_resched();

		/*
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EIOCBQUEUED. Hence mark
		 * an async read NOWAIT at that point.
		 */
		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
			iocb->ki_flags |= IOCB_NOWAIT;

		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
			break;

		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= isize))
			goto put_folios;
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
		 * When a read accesses the same folio several times, only
		 * mark it as accessed the first time.
		 */
		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
				    fbatch.folios[0]))
			folio_mark_accessed(fbatch.folios[0]);

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			size_t fsize = folio_size(folio);
			size_t offset = iocb->ki_pos & (fsize - 1);
			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
					     fsize - offset);
			size_t copied;

			if (end_offset < folio_pos(folio))
				break;
			if (i > 0)
				folio_mark_accessed(folio);
			/*
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_folio(folio);

			copied = copy_folio_to_iter(folio, offset, bytes, iter);

			already_read += copied;
			iocb->ki_pos += copied;
			last_pos = iocb->ki_pos;

			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
		}
put_folios:
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_put(fbatch.folios[i]);
		folio_batch_init(&fbatch);
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

	file_accessed(filp);
	ra->prev_pos = last_pos;
	return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);
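
/*
 * Illustrative sketch (not from the kernel tree): a filesystem that only
 * supports buffered reads can hand its ->read_iter() straight to
 * filemap_read().  example_file_read_iter() is hypothetical.
 */
static ssize_t __maybe_unused example_file_read_iter(struct kiocb *iocb,
		struct iov_iter *to)
{
	return filemap_read(iocb, to, 0);
}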

int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + count - 1;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_needs_writeback(mapping, pos, end))
			return -EAGAIN;
		return 0;
	}

	return filemap_write_and_wait_range(mapping, pos, end);
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);

int filemap_invalidate_pages(struct address_space *mapping,
			     loff_t pos, loff_t end, bool nowait)
{
	int ret;

	if (nowait) {
		/* we could block if there are any pages in the range */
		if (filemap_range_has_page(mapping, pos, end))
			return -EAGAIN;
	} else {
		ret = filemap_write_and_wait_range(mapping, pos, end);
		if (ret)
			return ret;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached page from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
					     end >> PAGE_SHIFT);
}

int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	return filemap_invalidate_pages(mapping, iocb->ki_pos,
					iocb->ki_pos + count - 1,
					iocb->ki_flags & IOCB_NOWAIT);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
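
/*
 * Illustrative sketch (not from the kernel tree): the usual preparation a
 * direct I/O write path performs with the two helpers above - flush dirty
 * pagecache for the range, then drop the clean copies so later buffered
 * reads go back to disk.  example_dio_write_prep() is hypothetical.
 */
static int __maybe_unused example_dio_write_prep(struct kiocb *iocb, size_t count)
{
	int ret;

	ret = kiocb_write_and_wait(iocb, count);
	if (ret)
		return ret;
	return kiocb_invalidate_pages(iocb, count);
}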

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;

		retval = kiocb_write_and_wait(iocb, count);
		if (retval < 0)
			return retval;
		file_accessed(file);

		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		if (retval != -EIOCBQUEUED)
			iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || IS_DAX(inode))
			return retval;
		if (iocb->ki_pos >= i_size_read(inode))
			return retval;
	}

	return filemap_read(iocb, iter, retval);
}
Linus Torvalds's avatar
2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854
/*
 * Splice subpages from a folio into a pipe.
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
			      struct folio *folio, loff_t fpos, size_t size)
{
	struct page *page;
	size_t spliced = 0, offset = offset_in_folio(folio, fpos);

	page = folio_page(folio, offset / PAGE_SIZE);
	size = min(size, folio_size(folio) - offset);
	offset %= PAGE_SIZE;

	while (spliced < size &&
	       !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);
		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);

		*buf = (struct pipe_buffer) {
			.ops	= &page_cache_pipe_buf_ops,
			.page	= page,
			.offset	= offset,
			.len	= part,
		};
		folio_get(folio);
		pipe->head++;
		page++;
		spliced += part;
		offset = 0;
	}

	return spliced;
}

/**
 * filemap_splice_read -  Splice data from a file's pagecache into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function gets folios from a file's pagecache and splices them into the
 * pipe.  Readahead will be called as necessary to fill more folios.  This may
 * be used for blockdevs also.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884
 */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
			    struct pipe_inode_info *pipe,
			    size_t len, unsigned int flags)
{
	struct folio_batch fbatch;
	struct kiocb iocb;
	size_t total_spliced = 0, used, npages;
	loff_t isize, end_offset;
	bool writably_mapped;
	int i, error = 0;

	if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
		return 0;

	init_sync_kiocb(&iocb, in);
	iocb.ki_pos = *ppos;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);

	folio_batch_init(&fbatch);

	do {
		cond_resched();

		if (*ppos >= i_size_read(in->f_mapping->host))
			break;

		iocb.ki_pos = *ppos;
		error = filemap_get_pages(&iocb, len, &fbatch, true);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(in->f_mapping->host);
		if (unlikely(*ppos >= isize))
			break;
		end_offset = min_t(loff_t, isize, *ppos + len);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(in->f_mapping);

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			size_t n;

			if (folio_pos(folio) >= end_offset)
				goto out;
			folio_mark_accessed(folio);

			/*
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_folio(folio);

			n = min_t(loff_t, len, isize - *ppos);
			n = splice_folio_into_pipe(pipe, folio, *ppos, n);
			if (!n)
				goto out;
			len -= n;
			total_spliced += n;
			*ppos += n;
			in->f_ra.prev_pos = *ppos;
			if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
				goto out;
		}

		folio_batch_release(&fbatch);
	} while (len);

out:
	folio_batch_release(&fbatch);
	file_accessed(in);

	return total_spliced ? total_spliced : error;
}
EXPORT_SYMBOL(filemap_splice_read);

static inline loff_t folio_seek_hole_data(struct xa_state *xas,
		struct address_space *mapping, struct folio *folio,
		loff_t start, loff_t end, bool seek_data)
{
	const struct address_space_operations *ops = mapping->a_ops;
	size_t offset, bsz = i_blocksize(mapping->host);

	if (xa_is_value(folio) || folio_test_uptodate(folio))
		return seek_data ? start : end;
	if (!ops->is_partially_uptodate)
		return seek_data ? end : start;

	xas_pause(xas);
	rcu_read_unlock();
	folio_lock(folio);
	if (unlikely(folio->mapping != mapping))
		goto unlock;

	offset = offset_in_folio(folio, start) & ~(bsz - 1);

	do {
		if (ops->is_partially_uptodate(folio, offset, bsz) ==
							seek_data)
			break;
		start = (start + bsz) & ~(bsz - 1);
		offset += bsz;
	} while (offset < folio_size(folio));
unlock:
	folio_unlock(folio);
	rcu_read_lock();
	return start;
}

static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
	if (xa_is_value(folio))
		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
	return folio_size(folio);
}

/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
 * entirely memory-based such as tmpfs, and filesystems which support
 * unwritten extents.
 *
Ingo Molnar's avatar
Ingo Molnar committed
3020
 * Return: The requested offset on success, or -ENXIO if @whence specifies
3021 3022 3023 3024 3025 3026 3027 3028
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
		loff_t end, int whence)
{
	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
3029
	pgoff_t max = (end - 1) >> PAGE_SHIFT;
3030
	bool seek_data = (whence == SEEK_DATA);
3031
	struct folio *folio;
3032 3033 3034 3035 3036

	if (end <= start)
		return -ENXIO;

	rcu_read_lock();
3037
	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
3038
		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
3039
		size_t seek_size;
3040 3041 3042 3043 3044 3045 3046

		if (start < pos) {
			if (!seek_data)
				goto unlock;
			start = pos;
		}

3047 3048 3049
		seek_size = seek_folio_size(&xas, folio);
		pos = round_up((u64)pos + 1, seek_size);
		start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
3050 3051
				seek_data);
		if (start < pos)
3052
			goto unlock;
3053 3054 3055 3056
		if (start >= end)
			break;
		if (seek_size > PAGE_SIZE)
			xas_set(&xas, pos >> PAGE_SHIFT);
3057 3058
		if (!xa_is_value(folio))
			folio_put(folio);
3059 3060
	}
	if (seek_data)
3061
		start = -ENXIO;
3062 3063
unlock:
	rcu_read_unlock();
3064 3065
	if (folio && !xa_is_value(folio))
		folio_put(folio);
3066 3067 3068 3069 3070
	if (start > end)
		return end;
	return start;
}
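
/*
 * Illustrative sketch (not from the kernel tree): an ->llseek() helper for a
 * purely in-memory filesystem built on mapping_seek_hole_data().
 * example_llseek_hole_data() is hypothetical.
 */
static loff_t __maybe_unused example_llseek_hole_data(struct file *file,
		loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	if (whence != SEEK_HOLE && whence != SEEK_DATA)
		return -EINVAL;
	return mapping_seek_hole_data(file->f_mapping, offset,
				      i_size_read(inode), whence);
}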

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @folio - the folio to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * This works similar to lock_folio_or_retry in that it can drop the
 * mmap_lock.  It differs in that it actually returns the folio locked
 * if it returns 1 and 0 if it couldn't lock the folio.  If we did have
 * to drop the mmap_lock then fpin will point to the pinned file and
 * needs to be fput()'ed at a later point.
 */
static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
				     struct file **fpin)
{
	if (folio_trylock(folio))
		return 1;

	/*
	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
	 * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
	 * is supposed to work. We have way too many special cases..
	 */
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		return 0;

	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
	if (vmf->flags & FAULT_FLAG_KILLABLE) {
		if (__folio_lock_killable(folio)) {
			/*
			 * We didn't have the right flags to drop the
			 * fault lock, but all fault_handlers only check
			 * for fatal signals if we return VM_FAULT_RETRY,
			 * so we need to drop the fault lock here and
			 * return 0 if we don't have a fpin.
			 */
			if (*fpin == NULL)
				release_fault_lock(vmf);
			return 0;
		}
	} else
		__folio_lock(folio);

	return 1;
}

/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
	struct file *fpin = NULL;
	unsigned long vm_flags = vmf->vma->vm_flags;
	unsigned int mmap_miss;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Use the readahead code, even if readahead is disabled */
	if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
		ra->size = HPAGE_PMD_NR;
		/*
		 * Fetch two PMD folios, so we get the chance to actually
		 * readahead, unless we've been told not to.
		 */
		if (!(vm_flags & VM_RAND_READ))
			ra->size *= 2;
		ra->async_size = HPAGE_PMD_NR;
		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
		return fpin;
	}
#endif

	/* If we don't want any read-ahead, don't bother */
	if (vm_flags & VM_RAND_READ)
		return fpin;
	if (!ra->ra_pages)
		return fpin;

	if (vm_flags & VM_SEQ_READ) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_sync_ra(&ractl, ra->ra_pages);
		return fpin;
	}

	/* Avoid banging the cache line if not needed */
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss < MMAP_LOTSAMISS * 10)
		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (mmap_miss > MMAP_LOTSAMISS)
		return fpin;

	/*
	 * mmap read-around
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
	ractl._index = ra->start;
	page_cache_ra_order(&ractl, ra, 0);
	return fpin;
}

/*
 * Asynchronous readahead happens when we find the folio with PG_readahead set,
 * so we want to possibly extend the readahead further.  We return the file that
 * was pinned if we have to drop the mmap_lock in order to do IO.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
					    struct folio *folio)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
	struct file *fpin = NULL;
	unsigned int mmap_miss;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
		return fpin;

	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss)
		WRITE_ONCE(ra->mmap_miss, --mmap_miss);

	if (folio_test_readahead(folio)) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_async_ra(&ractl, folio, ra->ra_pages);
	}
	return fpin;
}

static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;
	pte_t *ptep;

	/*
	 * We might have COW'ed a pagecache folio and might now have an mlocked
	 * anon folio mapped. The original pagecache folio is not mlocked and
	 * might have been evicted. During a read+clear/modify/write update of
	 * the PTE, such as done in do_numa_page()/change_pte_range(), we
	 * temporarily clear the PTE under PT lock and might detect it here as
	 * "none" when not holding the PT lock.
	 *
	 * Not rechecking the PTE under PT lock could result in an unexpected
	 * major fault in an mlock'ed region. Recheck only for this special
	 * scenario while holding the PT lock, to not degrade non-mlocked
	 * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
	 * the number of times we hold PT lock.
	 */
	if (!(vma->vm_flags & VM_LOCKED))
		return 0;

	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
		return 0;

	ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
				     &vmf->ptl);
	if (unlikely(!ptep))
		return VM_FAULT_NOPAGE;

	if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
		ret = VM_FAULT_NOPAGE;
	} else {
		spin_lock(vmf->ptl);
		if (unlikely(!pte_none(ptep_get(ptep))))
			ret = VM_FAULT_NOPAGE;
		spin_unlock(vmf->ptl);
	}
	pte_unmap(ptep);
	return ret;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct file *fpin = NULL;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t max_idx, index = vmf->pgoff;
	struct folio *folio;
	vm_fault_t ret = 0;
	bool mapping_locked = false;

	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(index >= max_idx))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	folio = filemap_get_folio(mapping, index);
	if (likely(!IS_ERR(folio))) {
		/*
		 * We found the page, so try async readahead before waiting for
		 * the lock.
		 */
		if (!(vmf->flags & FAULT_FLAG_TRIED))
			fpin = do_async_mmap_readahead(vmf, folio);
		if (unlikely(!folio_test_uptodate(folio))) {
			filemap_invalidate_lock_shared(mapping);
			mapping_locked = true;
		}
	} else {
		ret = filemap_fault_recheck_pte_none(vmf);
		if (unlikely(ret))
			return ret;

		/* No page in the page cache at all */
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		fpin = do_sync_mmap_readahead(vmf);
retry_find:
		/*
		 * See comment in filemap_create_folio() for why we need
		 * the invalidate_lock
		 */
		if (!mapping_locked) {
			filemap_invalidate_lock_shared(mapping);
			mapping_locked = true;
		}
		folio = __filemap_get_folio(mapping, index,
					  FGP_CREAT|FGP_FOR_MMAP,
					  vmf->gfp_mask);
		if (IS_ERR(folio)) {
			if (fpin)
				goto out_retry;
			filemap_invalidate_unlock_shared(mapping);
			return VM_FAULT_OOM;
		}
	}

	if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
		goto out_retry;

	/* Did it get truncated? */
	if (unlikely(folio->mapping != mapping)) {
		folio_unlock(folio);
		folio_put(folio);
		goto retry_find;
	}
	VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);

	/*
	 * We have a locked folio in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error,
	 * or because readahead was otherwise unable to retrieve it.
	 */
	if (unlikely(!folio_test_uptodate(folio))) {
		/*
		 * If the invalidate lock is not held, the folio was in cache
		 * and uptodate and now it is not. Strange but possible since we
		 * didn't hold the page lock all the time. Let's drop
		 * everything, get the invalidate lock and try again.
		 */
		if (!mapping_locked) {
			folio_unlock(folio);
			folio_put(folio);
			goto retry_find;
		}

		/*
		 * OK, the folio is really not uptodate. This can be because the
		 * VMA has the VM_RAND_READ flag set, or because an error
		 * arose. Let's read it in directly.
		 */
		goto page_not_uptodate;
	}

	/*
	 * We've made it this far and we had to drop our mmap_lock, so now is
	 * the time to return to the upper layer and have it re-find the vma
	 * and redo the fault.
	 */
	if (fpin) {
		folio_unlock(folio);
		goto out_retry;
	}
	if (mapping_locked)
		filemap_invalidate_unlock_shared(mapping);

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(index >= max_idx)) {
		folio_unlock(folio);
		folio_put(folio);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = folio_file_page(folio, index);
	return ret | VM_FAULT_LOCKED;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
	if (fpin)
		goto out_retry;
	folio_put(folio);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;
	filemap_invalidate_unlock_shared(mapping);

	return VM_FAULT_SIGBUS;

out_retry:
	/*
	 * We dropped the mmap_lock, so we need to return to the fault handler
	 * to re-find the vma and come back and find our hopefully still
	 * populated page.
	 */
	if (!IS_ERR(folio))
		folio_put(folio);
	if (mapping_locked)
		filemap_invalidate_unlock_shared(mapping);
	if (fpin)
		fput(fpin);
	return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);

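/*
 * Illustrative sketch (not part of this file): filemap_fault() and
 * filemap_map_pages() are meant to be plugged into a driver's or
 * filesystem's vm_operations_struct, usually together with a custom
 * ->page_mkwrite.  The "myfs_*" names below are hypothetical.
 */
#if 0
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	/* Take fs-specific locks / reserve blocks here, then dirty the folio. */
	return filemap_page_mkwrite(vmf);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= myfs_page_mkwrite,
};
#endif
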
static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
		pgoff_t start)
{
	struct mm_struct *mm = vmf->vma->vm_mm;

	/* Huge page is mapped? No need to proceed. */
	if (pmd_trans_huge(*vmf->pmd)) {
		folio_unlock(folio);
		folio_put(folio);
		return true;
	}

	if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
		struct page *page = folio_file_page(folio, start);
		vm_fault_t ret = do_set_pmd(vmf, page);
		if (!ret) {
			/* The page is mapped successfully, reference consumed. */
			folio_unlock(folio);
			return true;
		}
	}

	if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
		pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);

	return false;
}

static struct folio *next_uptodate_folio(struct xa_state *xas,
		struct address_space *mapping, pgoff_t end_pgoff)
{
	struct folio *folio = xas_next_entry(xas, end_pgoff);
	unsigned long max_idx;

	do {
		if (!folio)
			return NULL;
		if (xas_retry(xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_locked(folio))
			continue;
		if (!folio_try_get(folio))
			continue;
		/* Has the page moved or been split? */
		if (unlikely(folio != xas_reload(xas)))
			goto skip;
		if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
			goto skip;
		if (!folio_trylock(folio))
			goto skip;
		if (folio->mapping != mapping)
			goto unlock;
		if (!folio_test_uptodate(folio))
			goto unlock;
		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (xas->xa_index >= max_idx)
			goto unlock;
		return folio;
unlock:
		folio_unlock(folio);
skip:
		folio_put(folio);
	} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);

	return NULL;
}

/*
 * Map page range [start_page, start_page + nr_pages) of folio.
 * start_page is obtained from start via folio_page(folio, start).
 */
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
			struct folio *folio, unsigned long start,
			unsigned long addr, unsigned int nr_pages,
			unsigned long *rss, unsigned int *mmap_miss)
{
	vm_fault_t ret = 0;
	struct page *page = folio_page(folio, start);
	unsigned int count = 0;
	pte_t *old_ptep = vmf->pte;

	do {
		if (PageHWPoison(page + count))
			goto skip;

		/*
		 * If there are too many folios that are recently evicted
		 * in a file, they will probably continue to be evicted.
		 * In such situation, read-ahead is only a waste of IO.
		 * Don't decrease mmap_miss in this scenario to make sure
		 * we can stop read-ahead.
		 */
		if (!folio_test_workingset(folio))
			(*mmap_miss)++;

		/*
		 * NOTE: If there are PTE markers, we'll leave them to be
		 * handled in the specific fault path, and it'll prohibit the
		 * fault-around logic.
		 */
		if (!pte_none(ptep_get(&vmf->pte[count])))
			goto skip;

		count++;
		continue;
skip:
		if (count) {
			set_pte_range(vmf, folio, page, count, addr);
			*rss += count;
			folio_ref_add(folio, count);
			if (in_range(vmf->address, addr, count * PAGE_SIZE))
				ret = VM_FAULT_NOPAGE;
		}

		count++;
		page += count;
		vmf->pte += count;
		addr += count * PAGE_SIZE;
		count = 0;
	} while (--nr_pages > 0);

	if (count) {
		set_pte_range(vmf, folio, page, count, addr);
		*rss += count;
		folio_ref_add(folio, count);
		if (in_range(vmf->address, addr, count * PAGE_SIZE))
			ret = VM_FAULT_NOPAGE;
	}

	vmf->pte = old_ptep;

	return ret;
}

static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
		struct folio *folio, unsigned long addr,
		unsigned long *rss, unsigned int *mmap_miss)
{
	vm_fault_t ret = 0;
	struct page *page = &folio->page;

	if (PageHWPoison(page))
		return ret;

	/* See comment of filemap_map_folio_range() */
	if (!folio_test_workingset(folio))
		(*mmap_miss)++;

	/*
	 * NOTE: If there are PTE markers, we'll leave them to be
	 * handled in the specific fault path, and it'll prohibit
	 * the fault-around logic.
	 */
	if (!pte_none(ptep_get(vmf->pte)))
		return ret;

	if (vmf->address == addr)
		ret = VM_FAULT_NOPAGE;

	set_pte_range(vmf, folio, page, 1, addr);
	(*rss)++;
	folio_ref_inc(folio);

	return ret;
}

vm_fault_t filemap_map_pages(struct vm_fault *vmf,
			     pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct vm_area_struct *vma = vmf->vma;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t file_end, last_pgoff = start_pgoff;
	unsigned long addr;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct folio *folio;
	vm_fault_t ret = 0;
	unsigned long rss = 0;
	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;

	rcu_read_lock();
	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
	if (!folio)
		goto out;

	if (filemap_map_pmd(vmf, folio, start_pgoff)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
	if (!vmf->pte) {
		folio_unlock(folio);
		folio_put(folio);
		goto out;
	}

	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
	if (end_pgoff > file_end)
		end_pgoff = file_end;

	folio_type = mm_counter_file(folio);
	do {
		unsigned long end;

		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;
		end = folio_next_index(folio) - 1;
		nr_pages = min(end, end_pgoff) - xas.xa_index + 1;

		if (!folio_test_large(folio))
			ret |= filemap_map_order0_folio(vmf,
					folio, addr, &rss, &mmap_miss);
		else
			ret |= filemap_map_folio_range(vmf, folio,
					xas.xa_index - folio->index, addr,
					nr_pages, &rss, &mmap_miss);

		folio_unlock(folio);
		folio_put(folio);
	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
	add_mm_counter(vma->vm_mm, folio_type, rss);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	rcu_read_unlock();

	mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
	if (mmap_miss >= mmap_miss_saved)
		WRITE_ONCE(file->f_ra.mmap_miss, 0);
	else
		WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

	return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct folio *folio = page_folio(vmf->page);
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(mapping->host->i_sb);
	file_update_time(vmf->vma->vm_file);
	folio_lock(folio);
	if (folio->mapping != mapping) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the folio dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty folio and writeprotect it again.
	 */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);
out:
	sb_end_pagefault(mapping->host->i_sb);
	return ret;
}

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma_is_shared_maywrite(vma))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

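/*
 * Illustrative sketch (not part of this file): a simple filesystem normally
 * just points ->mmap at generic_file_mmap() (or generic_file_readonly_mmap()
 * when it cannot write pages back), which installs generic_file_vm_ops
 * above.  "myfs_file_operations" is a hypothetical name; the helpers it
 * references are the generic ones exported by the kernel.
 */
#if 0
static const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.fsync		= noop_fsync,
	.splice_read	= filemap_splice_read,
};
#endif
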
static struct folio *do_read_cache_folio(struct address_space *mapping,
		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;
	int err;

	if (!filler)
		filler = mapping->a_ops->read_folio;
repeat:
	folio = filemap_get_folio(mapping, index);
	if (IS_ERR(folio)) {
		folio = filemap_alloc_folio(gfp,
					    mapping_min_folio_order(mapping));
		if (!folio)
			return ERR_PTR(-ENOMEM);
		index = mapping_align_index(mapping, index);
		err = filemap_add_folio(mapping, folio, index, gfp);
		if (unlikely(err)) {
			folio_put(folio);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

		goto filler;
	}
	if (folio_test_uptodate(folio))
		goto out;

	if (!folio_trylock(folio)) {
		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/* Folio was truncated from mapping */
	if (!folio->mapping) {
		folio_unlock(folio);
		folio_put(folio);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (folio_test_uptodate(folio)) {
		folio_unlock(folio);
		goto out;
	}

filler:
	err = filemap_read_folio(file, filler, folio);
	if (err) {
		folio_put(folio);
		if (err == AOP_TRUNCATED_PAGE)
			goto repeat;
		return ERR_PTR(err);
	}

out:
	folio_mark_accessed(folio);
	return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Read one page into the page cache.  If it succeeds, the folio returned
 * will contain @index, but it may not be the first page of the folio.
 *
 * If the filler function returns an error, it will be returned to the
 * caller.
 *
 * Context: May sleep.  Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
		filler_t filler, struct file *file)
{
	return do_read_cache_folio(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_folio);

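/*
 * Illustrative sketch (not part of this file): reading one folio of a file
 * through the page cache with read_cache_folio().  As documented above, the
 * caller is expected to hold mapping->invalidate_lock; "myfs_peek_folio" is
 * a hypothetical helper.
 */
#if 0
static int myfs_peek_folio(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	filemap_invalidate_lock_shared(mapping);
	folio = read_cache_folio(mapping, index, NULL, NULL);
	filemap_invalidate_unlock_shared(mapping);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* The folio is uptodate and we hold a reference; use it, then drop it. */
	folio_put(folio);
	return 0;
}
#endif
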
/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping:	The address_space for the folio.
 * @index:	The index that the allocated folio will contain.
 * @gfp:	The page allocator flags to use if allocating.
 *
 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
 * any new memory allocations done using the specified allocation flags.
 *
 * The most likely error from this function is EIO, but ENOMEM is
 * possible and so is EINTR.  If ->read_folio returns another error,
 * that will be returned to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

static struct page *do_read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;

	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
	if (IS_ERR(folio))
		return &folio->page;
	return folio_file_page(folio, index);
}

struct page *read_cache_page(struct address_space *mapping,
			pgoff_t index, filler_t *filler, struct file *file)
{
	return do_read_cache_page(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	if (mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping,
			iocb->ki_pos >> PAGE_SHIFT,
			(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
		dio_warn_stale_pagecache(iocb->ki_filp);
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t write_len = iov_iter_count(from);
	ssize_t written;

	/*
	 * If a page cannot be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	written = kiocb_invalidate_pages(iocb, write_len);
	if (written) {
		if (written == -EBUSY)
			return 0;
		return written;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * A notable example is blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0) {
		struct inode *inode = mapping->host;
		loff_t pos = iocb->ki_pos;

		kiocb_invalidate_post_direct_write(iocb, written);
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	size_t chunk = mapping_max_folio_size(mapping);
	long status = 0;
	ssize_t written = 0;

	do {
		struct folio *folio;
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		bytes = iov_iter_count(i);
retry:
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		balance_dirty_pages_ratelimited(mapping);

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes,
						&folio, &fsdata);
		if (unlikely(status < 0))
			break;

		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		flush_dcache_folio(folio);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						folio, fsdata);
		if (unlikely(status != copied)) {
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				bytes = copied;
				goto retry;
			}
		} else {
			pos += status;
			written += status;
		}
	} while (iov_iter_count(i));

	if (!written)
		return status;
	iocb->ki_pos += written;
	return written;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
			return ret;
		return direct_write_fallback(iocb, from, ret,
				generic_perform_write(iocb, from));
	}

	return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in the O_SYNC case and
 * acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);

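/*
 * Illustrative sketch (not part of this file): a filesystem that needs extra
 * work around the generic write path (quota checks, preallocation, ...) can
 * open-code the wrapper above and call __generic_file_write_iter() directly.
 * "myfs_file_write_iter" and the fs-specific step are hypothetical.
 */
#if 0
static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0) {
		/* fs-specific preparation would go here */
		ret = __generic_file_write_iter(iocb, from);
	}
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif
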
/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This will also be called if the private_2 flag is set on a folio,
 * indicating that the folio has other metadata associated with it.
 *
 * The @gfp argument specifies whether I/O may be performed to release
 * this folio (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
	struct address_space * const mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	if (!folio_needs_release(folio))
		return true;
	if (folio_test_writeback(folio))
		return false;

	if (mapping && mapping->a_ops->release_folio)
		return mapping->a_ops->release_folio(folio, gfp);
	return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);

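/*
 * Illustrative sketch (not part of this file): a filesystem that attaches
 * only buffer heads to its folios can leave ->release_folio unset and let
 * filemap_release_folio() fall back to try_to_free_buffers().  One that
 * keeps private state supplies its own hook; "myfs_release_folio" and
 * "myfs_detach_private" are hypothetical.
 */
#if 0
static bool myfs_release_folio(struct folio *folio, gfp_t gfp)
{
	if (folio_test_dirty(folio) || folio_test_writeback(folio))
		return false;
	myfs_detach_private(folio);	/* hypothetical: free folio->private */
	return true;
}
#endif
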
/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte to in range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first.  Whilst the operation is
 * undertaken, the invalidate lock is held to prevent new folios from being
 * installed.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
			     loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t first = start >> PAGE_SHIFT;
	pgoff_t last = end >> PAGE_SHIFT;
	pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;

	if (!mapping || !mapping->nrpages || end < start)
		goto out;

	/* Prevent new folios from being added to the inode. */
	filemap_invalidate_lock(mapping);

	if (!mapping->nrpages)
		goto unlock;

	unmap_mapping_pages(mapping, first, nr, false);

	/* Write back the data if we're asked to. */
	if (flush) {
		struct writeback_control wbc = {
			.sync_mode	= WB_SYNC_ALL,
			.nr_to_write	= LONG_MAX,
			.range_start	= start,
			.range_end	= end,
		};

		filemap_fdatawrite_wbc(mapping, &wbc);
	}

	/* Wait for writeback to complete on all folios and discard. */
	invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);

unlock:
	filemap_invalidate_unlock(mapping);
out:
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);

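/*
 * Illustrative sketch (not part of this file): forcibly writing back and
 * dropping every cached folio over a byte range of an inode, e.g. before a
 * fs-specific operation that bypasses the page cache.  "myfs_punch_cache"
 * is a hypothetical helper.
 */
#if 0
static int myfs_punch_cache(struct inode *inode, loff_t start, loff_t end)
{
	/* Write dirty folios back, then invalidate [start, end]. */
	return filemap_invalidate_inode(inode, true, start, end);
}
#endif
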
#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:	The mapping to compute the statistics for.
 * @first_index:	The starting page cache index.
 * @last_index:	The final page index (inclusive).
 * @cs:	the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 */
static void filemap_cachestat(struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
	XA_STATE(xas, &mapping->i_pages, first_index);
	struct folio *folio;

	/* Flush stats (and potentially sleep) outside the RCU read section. */
	mem_cgroup_flush_stats_ratelimited(NULL);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_index) {
		int order;
		unsigned long nr_pages;
		pgoff_t folio_first_index, folio_last_index;

		/*
		 * Don't deref the folio. It is not pinned, and might
		 * get freed (and reused) underneath us.
		 *
		 * We *could* pin it, but that would be expensive for
		 * what should be a fast and lightweight syscall.
		 *
		 * Instead, derive all information of interest from
		 * the rcu-protected xarray.
		 */

		if (xas_retry(&xas, folio))
			continue;

		order = xa_get_order(xas.xa, xas.xa_index);
		nr_pages = 1 << order;
		folio_first_index = round_down(xas.xa_index, 1 << order);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				/* swapin error results in poisoned entry */
				if (non_swap_entry(swp))
					goto resched;

				/*
				 * Getting a swap entry from the shmem
				 * inode means we beat
				 * shmem_unuse(). rcu_read_lock()
				 * ensures swapoff waits for us before
				 * freeing the swapper space. However,
				 * we can race with swapping and
				 * invalidation, so there might not be
				 * a shadow in the swapcache (yet).
				 */
				shadow = get_shadow_from_swap_cache(swp);
				if (!shadow)
					goto resched;
			}
#endif
			if (workingset_test_recent(shadow, true, &workingset, false))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		/* page is in cache */
		cs->nr_cache += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
			cs->nr_dirty += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
			cs->nr_writeback += nr_pages;

resched:
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. The user should pass 0 (i.e. no flag specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
		struct cachestat_range __user *, cstat_range,
		struct cachestat __user *, cstat, unsigned int, flags)
{
	struct fd f = fdget(fd);
	struct address_space *mapping;
	struct cachestat_range csr;
	struct cachestat cs;
	pgoff_t first_index, last_index;

	if (!f.file)
		return -EBADF;

	if (copy_from_user(&csr, cstat_range,
			sizeof(struct cachestat_range))) {
		fdput(f);
		return -EFAULT;
	}

	/* hugetlbfs is not supported */
	if (is_file_hugepages(f.file)) {
		fdput(f);
		return -EOPNOTSUPP;
	}

	if (flags != 0) {
		fdput(f);
		return -EINVAL;
	}

	first_index = csr.off >> PAGE_SHIFT;
	last_index =
		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
	memset(&cs, 0, sizeof(struct cachestat));
	mapping = f.file->f_mapping;
	filemap_cachestat(mapping, first_index, last_index, &cs);
	fdput(f);

	if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
		return -EFAULT;

	return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */
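
/*
 * Illustrative userspace sketch (not part of this file): querying the page
 * cache statistics of a whole file via cachestat(2).  It assumes kernel and
 * libc headers new enough to provide __NR_cachestat and the structures from
 * <linux/mman.h>; otherwise treat it as purely hypothetical.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mman.h>		/* struct cachestat, struct cachestat_range */

int main(int argc, char **argv)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* whole file */
	struct cachestat cs;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
		close(fd);
		return 1;
	}
	printf("cached %llu dirty %llu writeback %llu evicted %llu recent %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}
#endif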