1/*
2 * Copyright 2010-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11#include <string.h>
12#include <stdlib.h>
13
14#include <algorithm>
15
16#include <KernelExport.h>
17#include <OS.h>
18
19#include <AutoDeleter.h>
20
21#include <arch/cpu.h>
22#include <arch/vm_translation_map.h>
23#include <block_cache.h>
24#include <boot/kernel_args.h>
25#include <condition_variable.h>
26#include <elf.h>
27#include <heap.h>
28#include <kernel.h>
29#include <low_resource_manager.h>
30#include <thread.h>
31#include <tracing.h>
32#include <util/AutoLock.h>
33#include <vfs.h>
34#include <vm/vm.h>
35#include <vm/vm_priv.h>
36#include <vm/vm_page.h>
37#include <vm/VMAddressSpace.h>
38#include <vm/VMArea.h>
39#include <vm/VMCache.h>
40
41#include "IORequest.h"
42#include "PageCacheLocker.h"
43#include "VMAnonymousCache.h"
44#include "VMPageQueue.h"
45
46
47//#define TRACE_VM_PAGE
48#ifdef TRACE_VM_PAGE
49#	define TRACE(x) dprintf x
50#else
51#	define TRACE(x) ;
52#endif
53
54//#define TRACE_VM_DAEMONS
55#ifdef TRACE_VM_DAEMONS
56#define TRACE_DAEMON(x...) dprintf(x)
57#else
58#define TRACE_DAEMON(x...) do {} while (false)
59#endif
60
61//#define TRACK_PAGE_USAGE_STATS	1
62
63#define PAGE_ASSERT(page, condition)	\
64	ASSERT_PRINT((condition), "page: %p", (page))
65
66#define SCRUB_SIZE 16
67	// this many pages will be cleared at once in the page scrubber thread
68
69#define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
70	// maximum I/O priority of the page writer
71#define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
72	// the maximum I/O priority shall be reached when this many pages need to
73	// be written
74
75
// The part of the page reserve that an allocation of the given priority must
// not touch.
77static const size_t kPageReserveForPriority[] = {
78	VM_PAGE_RESERVE_USER,		// user
79	VM_PAGE_RESERVE_SYSTEM,		// system
80	0							// VIP
81};
82
83// Minimum number of free pages the page daemon will try to achieve.
84static uint32 sFreePagesTarget;
85static uint32 sFreeOrCachedPagesTarget;
86static uint32 sInactivePagesTarget;
87
88// Wait interval between page daemon runs.
89static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
90static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
91
92// Number of idle runs after which we want to have processed the full active
93// queue.
94static const uint32 kIdleRunsForFullQueue = 20;
95
// Upper limit for vm_page::usage_count.
static const int32 kPageUsageMax = 64;
// The amount by which vm_page::usage_count is increased when the page is
// found accessed during a scan.
static const int32 kPageUsageAdvance = 3;
// The amount by which vm_page::usage_count is decreased when the page is
// found unaccessed during a scan.
static const int32 kPageUsageDecline = 1;
102
103int32 gMappedPagesCount;
104
105static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
106
107static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
108static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
109static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
110static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
111static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
112static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
113
114static vm_page *sPages;
115static page_num_t sPhysicalPageOffset;
116static page_num_t sNumPages;
117static page_num_t sNonExistingPages;
118	// pages in the sPages array that aren't backed by physical memory
119static uint64 sIgnoredPages;
120	// pages of physical memory ignored by the boot loader (and thus not
121	// available here)
122static int32 sUnreservedFreePages;
123static int32 sUnsatisfiedPageReservations;
124static int32 sModifiedTemporaryPages;
125
126static ConditionVariable sFreePageCondition;
127static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
128
// This lock must be used whenever the free or clear page queues are changed.
// If you need to work on both queues at the same time, you need to hold a
// write lock; otherwise a read lock suffices (each queue still has a spinlock
// to guard against concurrent changes).
133static rw_lock sFreePageQueuesLock
134	= RW_LOCK_INITIALIZER("free/clear page queues");
135
136#ifdef TRACK_PAGE_USAGE_STATS
137static page_num_t sPageUsageArrays[512];
138static page_num_t* sPageUsage = sPageUsageArrays;
139static page_num_t sPageUsagePageCount;
140static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
141static page_num_t sNextPageUsagePageCount;
142#endif
143
144
145#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
146
147struct caller_info {
148	addr_t		caller;
149	size_t		count;
150};
151
152static const int32 kCallerInfoTableSize = 1024;
153static caller_info sCallerInfoTable[kCallerInfoTableSize];
154static int32 sCallerInfoCount = 0;
155
156static caller_info* get_caller_info(addr_t caller);
157
158
159RANGE_MARKER_FUNCTION_PROTOTYPES(vm_page)
160
161static const addr_t kVMPageCodeAddressRange[] = {
162	RANGE_MARKER_FUNCTION_ADDRESS_RANGE(vm_page)
163};
164
165#endif
166
167
168RANGE_MARKER_FUNCTION_BEGIN(vm_page)
169
170
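// Snapshot of the global page counters used to decide whether active paging
// is needed.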
171struct page_stats {
172	int32	totalFreePages;
173	int32	unsatisfiedReservations;
174	int32	cachedPages;
175};
176
177
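// A thread waiting for enough free pages to satisfy its page reservation.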
178struct PageReservationWaiter
179		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
180	Thread*	thread;
181	uint32	dontTouch;		// reserve not to touch
182	uint32	missing;		// pages missing for the reservation
183	int32	threadPriority;
184
185	bool operator<(const PageReservationWaiter& other) const
186	{
187		// Implies an order by descending VM priority (ascending dontTouch)
188		// and (secondarily) descending thread priority.
189		if (dontTouch != other.dontTouch)
190			return dontTouch < other.dontTouch;
191		return threadPriority > other.threadPriority;
192	}
193};
194
195typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
196static PageReservationWaiterList sPageReservationWaiters;
197
198
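// Condition the page writer and page daemon threads block on between runs.
// A WakeUp() that arrives before Wait() is remembered (fActivated), so
// wake-ups are not lost.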
199struct DaemonCondition {
200	void Init(const char* name)
201	{
202		mutex_init(&fLock, "daemon condition");
203		fCondition.Init(this, name);
204		fActivated = false;
205	}
206
207	bool Lock()
208	{
209		return mutex_lock(&fLock) == B_OK;
210	}
211
212	void Unlock()
213	{
214		mutex_unlock(&fLock);
215	}
216
217	bool Wait(bigtime_t timeout, bool clearActivated)
218	{
219		MutexLocker locker(fLock);
220		if (clearActivated)
221			fActivated = false;
222		else if (fActivated)
223			return true;
224
225		ConditionVariableEntry entry;
226		fCondition.Add(&entry);
227
228		locker.Unlock();
229
230		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
231	}
232
233	void WakeUp()
234	{
235		if (fActivated)
236			return;
237
238		MutexLocker locker(fLock);
239		fActivated = true;
240		fCondition.NotifyOne();
241	}
242
243	void ClearActivated()
244	{
245		MutexLocker locker(fLock);
246		fActivated = false;
247	}
248
249private:
250	mutex				fLock;
251	ConditionVariable	fCondition;
252	bool				fActivated;
253};
254
255
256static DaemonCondition sPageWriterCondition;
257static DaemonCondition sPageDaemonCondition;
258
259
260#if PAGE_ALLOCATION_TRACING
261
262namespace PageAllocationTracing {
263
264class ReservePages : public AbstractTraceEntry {
265public:
266	ReservePages(uint32 count)
267		:
268		fCount(count)
269	{
270		Initialized();
271	}
272
273	virtual void AddDump(TraceOutput& out)
274	{
275		out.Print("page reserve:   %" B_PRIu32, fCount);
276	}
277
278private:
279	uint32		fCount;
280};
281
282
283class UnreservePages : public AbstractTraceEntry {
284public:
285	UnreservePages(uint32 count)
286		:
287		fCount(count)
288	{
289		Initialized();
290	}
291
292	virtual void AddDump(TraceOutput& out)
293	{
		out.Print("page unreserve: %" B_PRIu32, fCount);
295	}
296
297private:
298	uint32		fCount;
299};
300
301
302class AllocatePage
303	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
304public:
305	AllocatePage(page_num_t pageNumber)
306		:
307		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
308		fPageNumber(pageNumber)
309	{
310		Initialized();
311	}
312
313	virtual void AddDump(TraceOutput& out)
314	{
315		out.Print("page alloc: %#" B_PRIxPHYSADDR, fPageNumber);
316	}
317
318private:
319	page_num_t	fPageNumber;
320};
321
322
323class AllocatePageRun
324	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
325public:
326	AllocatePageRun(page_num_t startPage, uint32 length)
327		:
328		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
329		fStartPage(startPage),
330		fLength(length)
331	{
332		Initialized();
333	}
334
335	virtual void AddDump(TraceOutput& out)
336	{
337		out.Print("page alloc run: start %#" B_PRIxPHYSADDR " length: %"
338			B_PRIu32, fStartPage, fLength);
339	}
340
341private:
342	page_num_t	fStartPage;
343	uint32		fLength;
344};
345
346
347class FreePage
348	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
349public:
350	FreePage(page_num_t pageNumber)
351		:
352		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
353		fPageNumber(pageNumber)
354	{
355		Initialized();
356	}
357
358	virtual void AddDump(TraceOutput& out)
359	{
360		out.Print("page free: %#" B_PRIxPHYSADDR, fPageNumber);
361	}
362
363private:
364	page_num_t	fPageNumber;
365};
366
367
368class ScrubbingPages : public AbstractTraceEntry {
369public:
370	ScrubbingPages(uint32 count)
371		:
372		fCount(count)
373	{
374		Initialized();
375	}
376
377	virtual void AddDump(TraceOutput& out)
378	{
379		out.Print("page scrubbing: %" B_PRId32, fCount);
380	}
381
382private:
383	uint32		fCount;
384};
385
386
387class ScrubbedPages : public AbstractTraceEntry {
388public:
389	ScrubbedPages(uint32 count)
390		:
391		fCount(count)
392	{
393		Initialized();
394	}
395
396	virtual void AddDump(TraceOutput& out)
397	{
398		out.Print("page scrubbed:  %" B_PRId32, fCount);
399	}
400
401private:
402	uint32		fCount;
403};
404
405
406class StolenPage : public AbstractTraceEntry {
407public:
408	StolenPage()
409	{
410		Initialized();
411	}
412
413	virtual void AddDump(TraceOutput& out)
414	{
415		out.Print("page stolen");
416	}
417};
418
419}	// namespace PageAllocationTracing
420
421#	define TA(x)	new(std::nothrow) PageAllocationTracing::x
422
423#else
424#	define TA(x)
425#endif	// PAGE_ALLOCATION_TRACING
426
427
428#if PAGE_DAEMON_TRACING
429
430namespace PageDaemonTracing {
431
432class ActivatePage : public AbstractTraceEntry {
433	public:
434		ActivatePage(vm_page* page)
435			:
436			fCache(page->cache),
437			fPage(page)
438		{
439			Initialized();
440		}
441
442		virtual void AddDump(TraceOutput& out)
443		{
444			out.Print("page activated:   %p, cache: %p", fPage, fCache);
445		}
446
447	private:
448		VMCache*	fCache;
449		vm_page*	fPage;
450};
451
452
453class DeactivatePage : public AbstractTraceEntry {
454	public:
455		DeactivatePage(vm_page* page)
456			:
457			fCache(page->cache),
458			fPage(page)
459		{
460			Initialized();
461		}
462
463		virtual void AddDump(TraceOutput& out)
464		{
465			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
466		}
467
468	private:
469		VMCache*	fCache;
470		vm_page*	fPage;
471};
472
473
474class FreedPageSwap : public AbstractTraceEntry {
475	public:
476		FreedPageSwap(vm_page* page)
477			:
478			fCache(page->cache),
479			fPage(page)
480		{
481			Initialized();
482		}
483
484		virtual void AddDump(TraceOutput& out)
485		{
486			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
487		}
488
489	private:
490		VMCache*	fCache;
491		vm_page*	fPage;
492};
493
494}	// namespace PageDaemonTracing
495
496#	define TD(x)	new(std::nothrow) PageDaemonTracing::x
497
498#else
499#	define TD(x)
500#endif	// PAGE_DAEMON_TRACING
501
502
503#if PAGE_WRITER_TRACING
504
505namespace PageWriterTracing {
506
507class WritePage : public AbstractTraceEntry {
508	public:
509		WritePage(vm_page* page)
510			:
511			fCache(page->Cache()),
512			fPage(page)
513		{
514			Initialized();
515		}
516
517		virtual void AddDump(TraceOutput& out)
518		{
519			out.Print("page write: %p, cache: %p", fPage, fCache);
520		}
521
522	private:
523		VMCache*	fCache;
524		vm_page*	fPage;
525};
526
527}	// namespace PageWriterTracing
528
529#	define TPW(x)	new(std::nothrow) PageWriterTracing::x
530
531#else
532#	define TPW(x)
533#endif	// PAGE_WRITER_TRACING
534
535
536#if PAGE_STATE_TRACING
537
538namespace PageStateTracing {
539
540class SetPageState : public AbstractTraceEntry {
541	public:
542		SetPageState(vm_page* page, uint8 newState)
543			:
544			fPage(page),
545			fOldState(page->State()),
546			fNewState(newState),
547			fBusy(page->busy),
548			fWired(page->WiredCount() > 0),
549			fMapped(!page->mappings.IsEmpty()),
550			fAccessed(page->accessed),
551			fModified(page->modified)
552		{
553#if PAGE_STATE_TRACING_STACK_TRACE
554			fStackTrace = capture_tracing_stack_trace(
555				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
556				// Don't capture userland stack trace to avoid potential
557				// deadlocks.
558#endif
559			Initialized();
560		}
561
562#if PAGE_STATE_TRACING_STACK_TRACE
563		virtual void DumpStackTrace(TraceOutput& out)
564		{
565			out.PrintStackTrace(fStackTrace);
566		}
567#endif
568
569		virtual void AddDump(TraceOutput& out)
570		{
571			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
572				fBusy ? 'b' : '-',
573				fWired ? 'w' : '-',
574				fMapped ? 'm' : '-',
575				fAccessed ? 'a' : '-',
576				fModified ? 'm' : '-',
577				page_state_to_string(fOldState),
578				page_state_to_string(fNewState));
579		}
580
581	private:
582		vm_page*	fPage;
583#if PAGE_STATE_TRACING_STACK_TRACE
584		tracing_stack_trace* fStackTrace;
585#endif
586		uint8		fOldState;
587		uint8		fNewState;
588		bool		fBusy : 1;
589		bool		fWired : 1;
590		bool		fMapped : 1;
591		bool		fAccessed : 1;
592		bool		fModified : 1;
593};
594
595}	// namespace PageStateTracing
596
597#	define TPS(x)	new(std::nothrow) PageStateTracing::x
598
599#else
600#	define TPS(x)
601#endif	// PAGE_STATE_TRACING
602
603
604#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
605
606namespace BKernel {
607
608class AllocationTrackingCallback {
609public:
610	virtual						~AllocationTrackingCallback();
611
612	virtual	bool				ProcessTrackingInfo(
613									AllocationTrackingInfo* info,
614									page_num_t pageNumber) = 0;
615};
616
617}
618
619using BKernel::AllocationTrackingCallback;
620
621
622class AllocationCollectorCallback : public AllocationTrackingCallback {
623public:
624	AllocationCollectorCallback(bool resetInfos)
625		:
626		fResetInfos(resetInfos)
627	{
628	}
629
630	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
631		page_num_t pageNumber)
632	{
633		if (!info->IsInitialized())
634			return true;
635
636		addr_t caller = 0;
637		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
638
639		if (traceEntry != NULL && info->IsTraceEntryValid()) {
640			caller = tracing_find_caller_in_stack_trace(
641				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
642		}
643
644		caller_info* callerInfo = get_caller_info(caller);
645		if (callerInfo == NULL) {
646			kprintf("out of space for caller infos\n");
647			return false;
648		}
649
650		callerInfo->count++;
651
652		if (fResetInfos)
653			info->Clear();
654
655		return true;
656	}
657
658private:
659	bool	fResetInfos;
660};
661
662
663class AllocationInfoPrinterCallback : public AllocationTrackingCallback {
664public:
665	AllocationInfoPrinterCallback(bool printStackTrace, page_num_t pageFilter,
666		team_id teamFilter, thread_id threadFilter)
667		:
668		fPrintStackTrace(printStackTrace),
669		fPageFilter(pageFilter),
670		fTeamFilter(teamFilter),
671		fThreadFilter(threadFilter)
672	{
673	}
674
675	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
676		page_num_t pageNumber)
677	{
678		if (!info->IsInitialized())
679			return true;
680
681		if (fPageFilter != 0 && pageNumber != fPageFilter)
682			return true;
683
684		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
685		if (traceEntry != NULL && !info->IsTraceEntryValid())
686			traceEntry = NULL;
687
688		if (traceEntry != NULL) {
689			if (fTeamFilter != -1 && traceEntry->TeamID() != fTeamFilter)
690				return true;
691			if (fThreadFilter != -1 && traceEntry->ThreadID() != fThreadFilter)
692				return true;
693		} else {
694			// we need the info if we have filters set
695			if (fTeamFilter != -1 || fThreadFilter != -1)
696				return true;
697		}
698
699		kprintf("page number %#" B_PRIxPHYSADDR, pageNumber);
700
701		if (traceEntry != NULL) {
702			kprintf(", team: %" B_PRId32 ", thread %" B_PRId32
703				", time %" B_PRId64 "\n", traceEntry->TeamID(),
704				traceEntry->ThreadID(), traceEntry->Time());
705
706			if (fPrintStackTrace)
707				tracing_print_stack_trace(traceEntry->StackTrace());
708		} else
709			kprintf("\n");
710
711		return true;
712	}
713
714private:
715	bool		fPrintStackTrace;
716	page_num_t	fPageFilter;
717	team_id		fTeamFilter;
718	thread_id	fThreadFilter;
719};
720
721
722class AllocationDetailPrinterCallback : public AllocationTrackingCallback {
723public:
724	AllocationDetailPrinterCallback(addr_t caller)
725		:
726		fCaller(caller)
727	{
728	}
729
730	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
731		page_num_t pageNumber)
732	{
733		if (!info->IsInitialized())
734			return true;
735
736		addr_t caller = 0;
737		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
738		if (traceEntry != NULL && !info->IsTraceEntryValid())
739			traceEntry = NULL;
740
741		if (traceEntry != NULL) {
742			caller = tracing_find_caller_in_stack_trace(
743				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
744		}
745
746		if (caller != fCaller)
747			return true;
748
749		kprintf("page %#" B_PRIxPHYSADDR "\n", pageNumber);
750		if (traceEntry != NULL)
751			tracing_print_stack_trace(traceEntry->StackTrace());
752
753		return true;
754	}
755
756private:
757	addr_t	fCaller;
758};
759
760#endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
761
762
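/*!	Debugger command: searches all page queues for the given vm_page pointer
	and reports which queue, if any, currently contains it.
*/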
763static int
764find_page(int argc, char **argv)
765{
766	struct vm_page *page;
767	addr_t address;
768	int32 index = 1;
769	int i;
770
771	struct {
772		const char*	name;
773		VMPageQueue*	queue;
774	} pageQueueInfos[] = {
775		{ "free",		&sFreePageQueue },
776		{ "clear",		&sClearPageQueue },
777		{ "modified",	&sModifiedPageQueue },
778		{ "active",		&sActivePageQueue },
779		{ "inactive",	&sInactivePageQueue },
780		{ "cached",		&sCachedPageQueue },
781		{ NULL, NULL }
782	};
783
784	if (argc < 2
785		|| strlen(argv[index]) <= 2
786		|| argv[index][0] != '0'
787		|| argv[index][1] != 'x') {
788		kprintf("usage: find_page <address>\n");
789		return 0;
790	}
791
792	address = strtoul(argv[index], NULL, 0);
793	page = (vm_page*)address;
794
795	for (i = 0; pageQueueInfos[i].name; i++) {
796		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
797		while (vm_page* p = it.Next()) {
798			if (p == page) {
799				kprintf("found page %p in queue %p (%s)\n", page,
800					pageQueueInfos[i].queue, pageQueueInfos[i].name);
801				return 0;
802			}
803		}
804	}
805
806	kprintf("page %p isn't in any queue\n", page);
807
808	return 0;
809}
810
811
812const char *
813page_state_to_string(int state)
814{
815	switch(state) {
816		case PAGE_STATE_ACTIVE:
817			return "active";
818		case PAGE_STATE_INACTIVE:
819			return "inactive";
820		case PAGE_STATE_MODIFIED:
821			return "modified";
822		case PAGE_STATE_CACHED:
823			return "cached";
824		case PAGE_STATE_FREE:
825			return "free";
826		case PAGE_STATE_CLEAR:
827			return "clear";
828		case PAGE_STATE_WIRED:
829			return "wired";
830		case PAGE_STATE_UNUSED:
831			return "unused";
832		default:
833			return "unknown";
834	}
835}
836
837
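/*!	Debugger command: dumps the contents of a vm_page. The argument is
	interpreted as a vm_page pointer by default, as a virtual address with
	"-v", or as a physical address with "-p"; "-m" additionally searches all
	address spaces for mappings of the page.
*/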
838static int
839dump_page(int argc, char **argv)
840{
841	bool addressIsPointer = true;
842	bool physical = false;
843	bool searchMappings = false;
844	int32 index = 1;
845
846	while (index < argc) {
847		if (argv[index][0] != '-')
848			break;
849
850		if (!strcmp(argv[index], "-p")) {
851			addressIsPointer = false;
852			physical = true;
853		} else if (!strcmp(argv[index], "-v")) {
854			addressIsPointer = false;
855		} else if (!strcmp(argv[index], "-m")) {
856			searchMappings = true;
857		} else {
858			print_debugger_command_usage(argv[0]);
859			return 0;
860		}
861
862		index++;
863	}
864
865	if (index + 1 != argc) {
866		print_debugger_command_usage(argv[0]);
867		return 0;
868	}
869
870	uint64 value;
871	if (!evaluate_debug_expression(argv[index], &value, false))
872		return 0;
873
874	uint64 pageAddress = value;
875	struct vm_page* page;
876
877	if (addressIsPointer) {
878		page = (struct vm_page *)(addr_t)pageAddress;
879	} else {
880		if (!physical) {
881			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
882
883			if (debug_get_debugged_thread()->team->address_space != NULL)
884				addressSpace = debug_get_debugged_thread()->team->address_space;
885
886			uint32 flags = 0;
887			phys_addr_t physicalAddress;
888			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
889					&physicalAddress, &flags) != B_OK
890				|| (flags & PAGE_PRESENT) == 0) {
891				kprintf("Virtual address not mapped to a physical page in this "
892					"address space.\n");
893				return 0;
894			}
895			pageAddress = physicalAddress;
896		}
897
898		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
899	}
900
901	kprintf("PAGE: %p\n", page);
902	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
903		page->queue_link.previous);
904	kprintf("physical_number: %#" B_PRIxPHYSADDR "\n",
905		page->physical_page_number);
906	kprintf("cache:           %p\n", page->Cache());
907	kprintf("cache_offset:    %" B_PRIuPHYSADDR "\n", page->cache_offset);
908	kprintf("cache_next:      %p\n", page->cache_next);
909	kprintf("state:           %s\n", page_state_to_string(page->State()));
910	kprintf("wired_count:     %d\n", page->WiredCount());
911	kprintf("usage_count:     %d\n", page->usage_count);
912	kprintf("busy:            %d\n", page->busy);
913	kprintf("busy_writing:    %d\n", page->busy_writing);
914	kprintf("accessed:        %d\n", page->accessed);
915	kprintf("modified:        %d\n", page->modified);
916	#if DEBUG_PAGE_QUEUE
917		kprintf("queue:           %p\n", page->queue);
918	#endif
919	#if DEBUG_PAGE_ACCESS
920		kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
921	#endif
922	kprintf("area mappings:\n");
923
924	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
925	vm_page_mapping *mapping;
	while ((mapping = iterator.Next()) != NULL)
		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
930
931	if (searchMappings) {
932		kprintf("all mappings:\n");
933		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
934		while (addressSpace != NULL) {
935			size_t pageCount = addressSpace->Size() / B_PAGE_SIZE;
936			for (addr_t address = addressSpace->Base(); pageCount != 0;
937					address += B_PAGE_SIZE, pageCount--) {
938				phys_addr_t physicalAddress;
939				uint32 flags = 0;
940				if (addressSpace->TranslationMap()->QueryInterrupt(address,
941						&physicalAddress, &flags) == B_OK
942					&& (flags & PAGE_PRESENT) != 0
943					&& physicalAddress / B_PAGE_SIZE
944						== page->physical_page_number) {
945					VMArea* area = addressSpace->LookupArea(address);
946					kprintf("  aspace %" B_PRId32 ", area %" B_PRId32 ": %#"
947						B_PRIxADDR " (%c%c%s%s)\n", addressSpace->ID(),
948						area != NULL ? area->id : -1, address,
949						(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
950						(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
951						(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
952						(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
953				}
954			}
955			addressSpace = VMAddressSpace::DebugNext(addressSpace);
956		}
957	}
958
959	set_debug_variable("_cache", (addr_t)page->Cache());
960	#if DEBUG_PAGE_ACCESS
961		set_debug_variable("_accessor", page->accessing_thread);
962	#endif
963
964	return 0;
965}
966
967
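/*!	Debugger command: prints basic information about one of the global page
	queues, given by address or name; with an additional argument the queued
	pages are listed as well.
*/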
968static int
969dump_page_queue(int argc, char **argv)
970{
971	struct VMPageQueue *queue;
972
973	if (argc < 2) {
974		kprintf("usage: page_queue <address/name> [list]\n");
975		return 0;
976	}
977
978	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
979		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
980	else if (!strcmp(argv[1], "free"))
981		queue = &sFreePageQueue;
982	else if (!strcmp(argv[1], "clear"))
983		queue = &sClearPageQueue;
984	else if (!strcmp(argv[1], "modified"))
985		queue = &sModifiedPageQueue;
986	else if (!strcmp(argv[1], "active"))
987		queue = &sActivePageQueue;
988	else if (!strcmp(argv[1], "inactive"))
989		queue = &sInactivePageQueue;
990	else if (!strcmp(argv[1], "cached"))
991		queue = &sCachedPageQueue;
992	else {
993		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
994		return 0;
995	}
996
997	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %"
998		B_PRIuPHYSADDR "\n", queue, queue->Head(), queue->Tail(),
999		queue->Count());
1000
1001	if (argc == 3) {
1002		struct vm_page *page = queue->Head();
1003
1004		kprintf("page        cache       type       state  wired  usage\n");
1005		for (page_num_t i = 0; page; i++, page = queue->Next(page)) {
1006			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
1007				vm_cache_type_to_string(page->Cache()->type),
1008				page_state_to_string(page->State()),
1009				page->WiredCount(), page->usage_count);
1010		}
1011	}
1012	return 0;
1013}
1014
1015
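/*!	Debugger command: prints page counts by state, the longest free and
	free/cached page runs, the threads waiting for reservations, and the
	sizes of the individual page queues.
*/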
1016static int
1017dump_page_stats(int argc, char **argv)
1018{
1019	page_num_t swappableModified = 0;
1020	page_num_t swappableModifiedInactive = 0;
1021
1022	size_t counter[8];
1023	size_t busyCounter[8];
1024	memset(counter, 0, sizeof(counter));
1025	memset(busyCounter, 0, sizeof(busyCounter));
1026
1027	struct page_run {
1028		page_num_t	start;
1029		page_num_t	end;
1030
1031		page_num_t Length() const	{ return end - start; }
1032	};
1033
1034	page_run currentFreeRun = { 0, 0 };
1035	page_run currentCachedRun = { 0, 0 };
1036	page_run longestFreeRun = { 0, 0 };
1037	page_run longestCachedRun = { 0, 0 };
1038
1039	for (page_num_t i = 0; i < sNumPages; i++) {
1040		if (sPages[i].State() > 7) {
1041			panic("page %" B_PRIuPHYSADDR " at %p has invalid state!\n", i,
1042				&sPages[i]);
1043		}
1044
1045		uint32 pageState = sPages[i].State();
1046
1047		counter[pageState]++;
1048		if (sPages[i].busy)
1049			busyCounter[pageState]++;
1050
1051		if (pageState == PAGE_STATE_MODIFIED
1052			&& sPages[i].Cache() != NULL
1053			&& sPages[i].Cache()->temporary && sPages[i].WiredCount() == 0) {
1054			swappableModified++;
1055			if (sPages[i].usage_count == 0)
1056				swappableModifiedInactive++;
1057		}
1058
1059		// track free and cached pages runs
1060		if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
1061			currentFreeRun.end = i + 1;
1062			currentCachedRun.end = i + 1;
1063		} else {
1064			if (currentFreeRun.Length() > longestFreeRun.Length())
1065				longestFreeRun = currentFreeRun;
1066			currentFreeRun.start = currentFreeRun.end = i + 1;
1067
1068			if (pageState == PAGE_STATE_CACHED) {
1069				currentCachedRun.end = i + 1;
1070			} else {
1071				if (currentCachedRun.Length() > longestCachedRun.Length())
1072					longestCachedRun = currentCachedRun;
1073				currentCachedRun.start = currentCachedRun.end = i + 1;
1074			}
1075		}
1076	}
1077
1078	kprintf("page stats:\n");
1079	kprintf("total: %" B_PRIuPHYSADDR "\n", sNumPages);
1080
1081	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1082		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
1083	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1084		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
1085	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1086		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
1087	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1088		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
1089	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1090		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
1091	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1092		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
1093	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
1094	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
1095
1096	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
1097	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
1098		sUnsatisfiedPageReservations);
1099	kprintf("mapped pages: %" B_PRId32 "\n", gMappedPagesCount);
1100	kprintf("longest free pages run: %" B_PRIuPHYSADDR " pages (at %"
1101		B_PRIuPHYSADDR ")\n", longestFreeRun.Length(),
1102		sPages[longestFreeRun.start].physical_page_number);
1103	kprintf("longest free/cached pages run: %" B_PRIuPHYSADDR " pages (at %"
1104		B_PRIuPHYSADDR ")\n", longestCachedRun.Length(),
1105		sPages[longestCachedRun.start].physical_page_number);
1106
1107	kprintf("waiting threads:\n");
1108	for (PageReservationWaiterList::Iterator it
1109			= sPageReservationWaiters.GetIterator();
1110		PageReservationWaiter* waiter = it.Next();) {
1111		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
1112			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
1113			waiter->missing, waiter->dontTouch);
1114	}
1115
1116	kprintf("\nfree queue: %p, count = %" B_PRIuPHYSADDR "\n", &sFreePageQueue,
1117		sFreePageQueue.Count());
1118	kprintf("clear queue: %p, count = %" B_PRIuPHYSADDR "\n", &sClearPageQueue,
1119		sClearPageQueue.Count());
	kprintf("modified queue: %p, count = %" B_PRIuPHYSADDR " (%" B_PRId32
		" temporary, %" B_PRIuPHYSADDR " swappable, inactive: %"
		B_PRIuPHYSADDR ")\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
1124	kprintf("active queue: %p, count = %" B_PRIuPHYSADDR "\n",
1125		&sActivePageQueue, sActivePageQueue.Count());
1126	kprintf("inactive queue: %p, count = %" B_PRIuPHYSADDR "\n",
1127		&sInactivePageQueue, sInactivePageQueue.Count());
1128	kprintf("cached queue: %p, count = %" B_PRIuPHYSADDR "\n",
1129		&sCachedPageQueue, sCachedPageQueue.Count());
1130	return 0;
1131}
1132
1133
1134#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1135
1136static caller_info*
1137get_caller_info(addr_t caller)
1138{
1139	// find the caller info
1140	for (int32 i = 0; i < sCallerInfoCount; i++) {
1141		if (caller == sCallerInfoTable[i].caller)
1142			return &sCallerInfoTable[i];
1143	}
1144
1145	// not found, add a new entry, if there are free slots
1146	if (sCallerInfoCount >= kCallerInfoTableSize)
1147		return NULL;
1148
1149	caller_info* info = &sCallerInfoTable[sCallerInfoCount++];
1150	info->caller = caller;
1151	info->count = 0;
1152
1153	return info;
1154}
1155
1156
1157static int
1158caller_info_compare_count(const void* _a, const void* _b)
1159{
1160	const caller_info* a = (const caller_info*)_a;
1161	const caller_info* b = (const caller_info*)_b;
1162	return (int)(b->count - a->count);
1163}
1164
1165
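/*!	Debugger command: aggregates the page allocation tracking info by caller
	and prints the number of allocations per caller, or, with "-d", the
	details of all allocations made by a specific caller.
*/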
1166static int
1167dump_page_allocations_per_caller(int argc, char** argv)
1168{
1169	bool resetAllocationInfos = false;
1170	bool printDetails = false;
1171	addr_t caller = 0;
1172
1173	for (int32 i = 1; i < argc; i++) {
1174		if (strcmp(argv[i], "-d") == 0) {
1175			uint64 callerAddress;
1176			if (++i >= argc
1177				|| !evaluate_debug_expression(argv[i], &callerAddress, true)) {
1178				print_debugger_command_usage(argv[0]);
1179				return 0;
1180			}
1181
1182			caller = callerAddress;
1183			printDetails = true;
1184		} else if (strcmp(argv[i], "-r") == 0) {
1185			resetAllocationInfos = true;
1186		} else {
1187			print_debugger_command_usage(argv[0]);
1188			return 0;
1189		}
1190	}
1191
1192	sCallerInfoCount = 0;
1193
1194	AllocationCollectorCallback collectorCallback(resetAllocationInfos);
1195	AllocationDetailPrinterCallback detailsCallback(caller);
1196	AllocationTrackingCallback& callback = printDetails
1197		? (AllocationTrackingCallback&)detailsCallback
1198		: (AllocationTrackingCallback&)collectorCallback;
1199
1200	for (page_num_t i = 0; i < sNumPages; i++)
1201		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1202
1203	if (printDetails)
1204		return 0;
1205
1206	// sort the array
1207	qsort(sCallerInfoTable, sCallerInfoCount, sizeof(caller_info),
1208		&caller_info_compare_count);
1209
1210	kprintf("%" B_PRId32 " different callers\n\n", sCallerInfoCount);
1211
1212	size_t totalAllocationCount = 0;
1213
1214	kprintf("     count      caller\n");
1215	kprintf("----------------------------------\n");
1216	for (int32 i = 0; i < sCallerInfoCount; i++) {
1217		caller_info& info = sCallerInfoTable[i];
1218		kprintf("%10" B_PRIuSIZE "  %p", info.count, (void*)info.caller);
1219
1220		const char* symbol;
1221		const char* imageName;
1222		bool exactMatch;
1223		addr_t baseAddress;
1224
1225		if (elf_debug_lookup_symbol_address(info.caller, &baseAddress, &symbol,
1226				&imageName, &exactMatch) == B_OK) {
1227			kprintf("  %s + %#" B_PRIxADDR " (%s)%s\n", symbol,
1228				info.caller - baseAddress, imageName,
1229				exactMatch ? "" : " (nearest)");
1230		} else
1231			kprintf("\n");
1232
1233		totalAllocationCount += info.count;
1234	}
1235
1236	kprintf("\ntotal page allocations: %" B_PRIuSIZE "\n",
1237		totalAllocationCount);
1238
1239	return 0;
1240}
1241
1242
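/*!	Debugger command: prints the allocation tracking info of all pages,
	optionally filtered by page number ("-p"), team ("--team"), or thread
	("--thread"), optionally including stack traces ("--stacktrace").
*/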
1243static int
1244dump_page_allocation_infos(int argc, char** argv)
1245{
1246	page_num_t pageFilter = 0;
1247	team_id teamFilter = -1;
1248	thread_id threadFilter = -1;
1249	bool printStackTraces = false;
1250
1251	for (int32 i = 1; i < argc; i++) {
1252		if (strcmp(argv[i], "--stacktrace") == 0)
1253			printStackTraces = true;
1254		else if (strcmp(argv[i], "-p") == 0) {
1255			uint64 pageNumber;
1256			if (++i >= argc
1257				|| !evaluate_debug_expression(argv[i], &pageNumber, true)) {
1258				print_debugger_command_usage(argv[0]);
1259				return 0;
1260			}
1261
1262			pageFilter = pageNumber;
1263		} else if (strcmp(argv[i], "--team") == 0) {
1264			uint64 team;
1265			if (++i >= argc
1266				|| !evaluate_debug_expression(argv[i], &team, true)) {
1267				print_debugger_command_usage(argv[0]);
1268				return 0;
1269			}
1270
1271			teamFilter = team;
1272		} else if (strcmp(argv[i], "--thread") == 0) {
1273			uint64 thread;
1274			if (++i >= argc
1275				|| !evaluate_debug_expression(argv[i], &thread, true)) {
1276				print_debugger_command_usage(argv[0]);
1277				return 0;
1278			}
1279
1280			threadFilter = thread;
1281		} else {
1282			print_debugger_command_usage(argv[0]);
1283			return 0;
1284		}
1285	}
1286
1287	AllocationInfoPrinterCallback callback(printStackTraces, pageFilter,
1288		teamFilter, threadFilter);
1289
1290	for (page_num_t i = 0; i < sNumPages; i++)
1291		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1292
1293	return 0;
1294}
1295
1296#endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1297
1298
1299#ifdef TRACK_PAGE_USAGE_STATS
1300
1301static void
1302track_page_usage(vm_page* page)
1303{
1304	if (page->WiredCount() == 0) {
1305		sNextPageUsage[(int32)page->usage_count + 128]++;
1306		sNextPageUsagePageCount++;
1307	}
1308}
1309
1310
1311static void
1312update_page_usage_stats()
1313{
1314	std::swap(sPageUsage, sNextPageUsage);
1315	sPageUsagePageCount = sNextPageUsagePageCount;
1316
1317	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
1318	sNextPageUsagePageCount = 0;
1319
1320	// compute average
1321	if (sPageUsagePageCount > 0) {
1322		int64 sum = 0;
1323		for (int32 i = 0; i < 256; i++)
1324			sum += (int64)sPageUsage[i] * (i - 128);
1325
1326		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
1327			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
1328	}
1329}
1330
1331
1332static int
1333dump_page_usage_stats(int argc, char** argv)
1334{
1335	kprintf("distribution of page usage counts (%lu pages):",
1336		sPageUsagePageCount);
1337
1338	int64 sum = 0;
1339	for (int32 i = 0; i < 256; i++) {
1340		if (i % 8 == 0)
1341			kprintf("\n%4ld:", i - 128);
1342
1343		int64 count = sPageUsage[i];
1344		sum += count * (i - 128);
1345
1346		kprintf("  %9llu", count);
1347	}
1348
1349	kprintf("\n\n");
1350
1351	kprintf("average usage count: %f\n",
1352		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
1353
1354	return 0;
1355}
1356
1357#endif	// TRACK_PAGE_USAGE_STATS
1358
1359
1360// #pragma mark - vm_page
1361
1362
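/*!	Sets the page's initial state without recording a page state tracing
	entry (unlike SetState()).
*/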
1363inline void
1364vm_page::InitState(uint8 newState)
1365{
1366	state = newState;
1367}
1368
1369
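/*!	Sets the page's state and records a page state tracing entry, if enabled.
*/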
1370inline void
1371vm_page::SetState(uint8 newState)
1372{
1373	TPS(SetPageState(this, newState));
1374
1375	state = newState;
1376}
1377
1378
1379// #pragma mark -
1380
1381
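/*!	Fills in \a _pageStats from the global counters. The values are not read
	atomically, so the result is not a consistent snapshot.
*/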
1382static void
1383get_page_stats(page_stats& _pageStats)
1384{
1385	_pageStats.totalFreePages = sUnreservedFreePages;
1386	_pageStats.cachedPages = sCachedPageQueue.Count();
1387	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
1388	// TODO: We don't get an actual snapshot here!
1389}
1390
1391
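/*!	Returns whether the page daemon needs to free pages actively, i.e.
	whether the free plus cached pages fall short of the unsatisfied
	reservations plus the free/cached pages target.
*/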
1392static bool
1393do_active_paging(const page_stats& pageStats)
1394{
1395	return pageStats.totalFreePages + pageStats.cachedPages
1396		< pageStats.unsatisfiedReservations
1397			+ (int32)sFreeOrCachedPagesTarget;
1398}
1399
1400
1401/*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1402	\a count. Doesn't touch the last \a dontTouch pages of
1403	\c sUnreservedFreePages, though.
1404	\return The number of actually reserved pages.
1405*/
1406static uint32
1407reserve_some_pages(uint32 count, uint32 dontTouch)
1408{
1409	while (true) {
1410		int32 freePages = atomic_get(&sUnreservedFreePages);
1411		if (freePages <= (int32)dontTouch)
1412			return 0;
1413
1414		int32 toReserve = std::min(count, freePages - dontTouch);
1415		if (atomic_test_and_set(&sUnreservedFreePages,
1416					freePages - toReserve, freePages)
1417				== freePages) {
1418			return toReserve;
1419		}
1420
1421		// the count changed in the meantime -- retry
1422	}
1423}
1424
1425
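/*!	Tries to fulfill the reservations of the waiting threads in list order
	(i.e. by descending priority), unblocking each waiter whose reservation
	could be satisfied completely.
*/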
1426static void
1427wake_up_page_reservation_waiters()
1428{
1429	MutexLocker pageDeficitLocker(sPageDeficitLock);
1430
	// TODO: If this is a low priority thread, we might want to disable
	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
	// high priority threads might be kept waiting while a medium priority
	// thread prevents us from running.
1435
1436	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1437		int32 reserved = reserve_some_pages(waiter->missing,
1438			waiter->dontTouch);
1439		if (reserved == 0)
1440			return;
1441
1442		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1443		waiter->missing -= reserved;
1444
1445		if (waiter->missing > 0)
1446			return;
1447
1448		sPageReservationWaiters.Remove(waiter);
1449
1450		thread_unblock(waiter->thread, B_OK);
1451	}
1452}
1453
1454
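/*!	Returns \a count reserved pages to the free page pool and, if there are
	unsatisfied reservations, wakes up the waiting threads.
*/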
1455static inline void
1456unreserve_pages(uint32 count)
1457{
1458	atomic_add(&sUnreservedFreePages, count);
1459	if (atomic_get(&sUnsatisfiedPageReservations) != 0)
1460		wake_up_page_reservation_waiters();
1461}
1462
1463
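/*!	Moves the page from its current queue to the free or clear queue
	(depending on \a clear). The page must not be mapped and must no longer
	belong to a cache.
*/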
1464static void
1465free_page(vm_page* page, bool clear)
1466{
1467	DEBUG_PAGE_ACCESS_CHECK(page);
1468
1469	PAGE_ASSERT(page, !page->IsMapped());
1470
1471	VMPageQueue* fromQueue;
1472
1473	switch (page->State()) {
1474		case PAGE_STATE_ACTIVE:
1475			fromQueue = &sActivePageQueue;
1476			break;
1477		case PAGE_STATE_INACTIVE:
1478			fromQueue = &sInactivePageQueue;
1479			break;
1480		case PAGE_STATE_MODIFIED:
1481			fromQueue = &sModifiedPageQueue;
1482			break;
1483		case PAGE_STATE_CACHED:
1484			fromQueue = &sCachedPageQueue;
1485			break;
1486		case PAGE_STATE_FREE:
1487		case PAGE_STATE_CLEAR:
1488			panic("free_page(): page %p already free", page);
1489			return;
1490		case PAGE_STATE_WIRED:
1491		case PAGE_STATE_UNUSED:
1492			fromQueue = NULL;
1493			break;
1494		default:
1495			panic("free_page(): page %p in invalid state %d",
1496				page, page->State());
1497			return;
1498	}
1499
1500	if (page->CacheRef() != NULL)
1501		panic("to be freed page %p has cache", page);
1502	if (page->IsMapped())
1503		panic("to be freed page %p has mappings", page);
1504
1505	if (fromQueue != NULL)
1506		fromQueue->RemoveUnlocked(page);
1507
1508	TA(FreePage(page->physical_page_number));
1509
1510#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1511	page->allocation_tracking_info.Clear();
1512#endif
1513
1514	ReadLocker locker(sFreePageQueuesLock);
1515
1516	DEBUG_PAGE_ACCESS_END(page);
1517
1518	if (clear) {
1519		page->SetState(PAGE_STATE_CLEAR);
1520		sClearPageQueue.PrependUnlocked(page);
1521	} else {
1522		page->SetState(PAGE_STATE_FREE);
1523		sFreePageQueue.PrependUnlocked(page);
1524	}
1525
1526	locker.Unlock();
1527}
1528
1529
1530/*!	The caller must make sure that no-one else tries to change the page's state
1531	while the function is called. If the page has a cache, this can be done by
1532	locking the cache.
1533*/
1534static void
1535set_page_state(vm_page *page, int pageState)
1536{
1537	DEBUG_PAGE_ACCESS_CHECK(page);
1538
1539	if (pageState == page->State())
1540		return;
1541
1542	VMPageQueue* fromQueue;
1543
1544	switch (page->State()) {
1545		case PAGE_STATE_ACTIVE:
1546			fromQueue = &sActivePageQueue;
1547			break;
1548		case PAGE_STATE_INACTIVE:
1549			fromQueue = &sInactivePageQueue;
1550			break;
1551		case PAGE_STATE_MODIFIED:
1552			fromQueue = &sModifiedPageQueue;
1553			break;
1554		case PAGE_STATE_CACHED:
1555			fromQueue = &sCachedPageQueue;
1556			break;
1557		case PAGE_STATE_FREE:
1558		case PAGE_STATE_CLEAR:
1559			panic("set_page_state(): page %p is free/clear", page);
1560			return;
1561		case PAGE_STATE_WIRED:
1562		case PAGE_STATE_UNUSED:
1563			fromQueue = NULL;
1564			break;
1565		default:
1566			panic("set_page_state(): page %p in invalid state %d",
1567				page, page->State());
1568			return;
1569	}
1570
1571	VMPageQueue* toQueue;
1572
1573	switch (pageState) {
1574		case PAGE_STATE_ACTIVE:
1575			toQueue = &sActivePageQueue;
1576			break;
1577		case PAGE_STATE_INACTIVE:
1578			toQueue = &sInactivePageQueue;
1579			break;
1580		case PAGE_STATE_MODIFIED:
1581			toQueue = &sModifiedPageQueue;
1582			break;
1583		case PAGE_STATE_CACHED:
1584			PAGE_ASSERT(page, !page->IsMapped());
1585			PAGE_ASSERT(page, !page->modified);
1586			toQueue = &sCachedPageQueue;
1587			break;
1588		case PAGE_STATE_FREE:
1589		case PAGE_STATE_CLEAR:
1590			panic("set_page_state(): target state is free/clear");
1591			return;
1592		case PAGE_STATE_WIRED:
1593		case PAGE_STATE_UNUSED:
1594			toQueue = NULL;
1595			break;
1596		default:
1597			panic("set_page_state(): invalid target state %d", pageState);
1598			return;
1599	}
1600
1601	VMCache* cache = page->Cache();
1602	if (cache != NULL && cache->temporary) {
1603		if (pageState == PAGE_STATE_MODIFIED)
1604			atomic_add(&sModifiedTemporaryPages, 1);
1605		else if (page->State() == PAGE_STATE_MODIFIED)
1606			atomic_add(&sModifiedTemporaryPages, -1);
1607	}
1608
1609	// move the page
1610	if (toQueue == fromQueue) {
1611		// Note: Theoretically we are required to lock when changing the page
1612		// state, even if we don't change the queue. We actually don't have to
1613		// do this, though, since only for the active queue there are different
1614		// page states and active pages have a cache that must be locked at
1615		// this point. So we rely on the fact that everyone must lock the cache
1616		// before trying to change/interpret the page state.
1617		PAGE_ASSERT(page, cache != NULL);
1618		cache->AssertLocked();
1619		page->SetState(pageState);
1620	} else {
1621		if (fromQueue != NULL)
1622			fromQueue->RemoveUnlocked(page);
1623
1624		page->SetState(pageState);
1625
1626		if (toQueue != NULL)
1627			toQueue->AppendUnlocked(page);
1628	}
1629}
1630
1631
1632/*! Moves a previously modified page into a now appropriate queue.
1633	The page queues must not be locked.
1634*/
1635static void
1636move_page_to_appropriate_queue(vm_page *page)
1637{
1638	DEBUG_PAGE_ACCESS_CHECK(page);
1639
1640	// Note, this logic must be in sync with what the page daemon does.
1641	int32 state;
1642	if (page->IsMapped())
1643		state = PAGE_STATE_ACTIVE;
1644	else if (page->modified)
1645		state = PAGE_STATE_MODIFIED;
1646	else
1647		state = PAGE_STATE_CACHED;
1648
1649// TODO: If free + cached pages are low, we might directly want to free the
1650// page.
1651	set_page_state(page, state);
1652}
1653
1654
1655static void
1656clear_page(struct vm_page *page)
1657{
1658	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1659		B_PAGE_SIZE);
1660}
1661
1662
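/*!	Marks the given range of physical pages as in use (wired or unused),
	removing any free/clear pages in the range from their queues. Ranges
	extending beyond the managed page array are clipped. Meant to be used
	during the early boot process only.
*/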
1663static status_t
1664mark_page_range_in_use(page_num_t startPage, page_num_t length, bool wired)
1665{
1666	TRACE(("mark_page_range_in_use: start %#" B_PRIxPHYSADDR ", len %#"
1667		B_PRIxPHYSADDR "\n", startPage, length));
1668
1669	if (sPhysicalPageOffset > startPage) {
1670		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1671			"): start page is before free list\n", startPage, length);
1672		if (sPhysicalPageOffset - startPage >= length)
1673			return B_OK;
1674		length -= sPhysicalPageOffset - startPage;
1675		startPage = sPhysicalPageOffset;
1676	}
1677
1678	startPage -= sPhysicalPageOffset;
1679
1680	if (startPage + length > sNumPages) {
1681		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1682			"): range would extend past free list\n", startPage, length);
1683		if (startPage >= sNumPages)
1684			return B_OK;
1685		length = sNumPages - startPage;
1686	}
1687
1688	WriteLocker locker(sFreePageQueuesLock);
1689
1690	for (page_num_t i = 0; i < length; i++) {
1691		vm_page *page = &sPages[startPage + i];
1692		switch (page->State()) {
1693			case PAGE_STATE_FREE:
1694			case PAGE_STATE_CLEAR:
1695			{
1696// TODO: This violates the page reservation policy, since we remove pages from
1697// the free/clear queues without having reserved them before. This should happen
1698// in the early boot process only, though.
1699				DEBUG_PAGE_ACCESS_START(page);
1700				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1701					? sFreePageQueue : sClearPageQueue;
1702				queue.Remove(page);
1703				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1704				page->busy = false;
1705				atomic_add(&sUnreservedFreePages, -1);
1706				DEBUG_PAGE_ACCESS_END(page);
1707				break;
1708			}
1709			case PAGE_STATE_WIRED:
1710			case PAGE_STATE_UNUSED:
1711				break;
1712			case PAGE_STATE_ACTIVE:
1713			case PAGE_STATE_INACTIVE:
1714			case PAGE_STATE_MODIFIED:
1715			case PAGE_STATE_CACHED:
1716			default:
1717				// uh
1718				dprintf("mark_page_range_in_use: page %#" B_PRIxPHYSADDR
1719					" in non-free state %d!\n", startPage + i, page->State());
1720				break;
1721		}
1722	}
1723
1724	return B_OK;
1725}
1726
1727
1728/*!
1729	This is a background thread that wakes up every now and then (every 100ms)
1730	and moves some pages from the free queue over to the clear queue.
1731	Given enough time, it will clear out all pages from the free queue - we
1732	could probably slow it down after having reached a certain threshold.
1733*/
1734static int32
1735page_scrubber(void *unused)
1736{
1737	(void)(unused);
1738
1739	TRACE(("page_scrubber starting...\n"));
1740
1741	for (;;) {
1742		snooze(100000); // 100ms
1743
1744		if (sFreePageQueue.Count() == 0
1745				|| atomic_get(&sUnreservedFreePages)
1746					< (int32)sFreePagesTarget) {
1747			continue;
1748		}
1749
		// Since we temporarily remove pages from the free pages reserve,
		// we must make sure we don't cause a violation of the page
		// reservation guarantee. The following is usually stricter than
		// necessary, because we don't have information on how many of the
		// reserved pages have already been allocated.
1755		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1756			kPageReserveForPriority[VM_PRIORITY_USER]);
1757		if (reserved == 0)
1758			continue;
1759
1760		// get some pages from the free queue
1761		ReadLocker locker(sFreePageQueuesLock);
1762
1763		vm_page *page[SCRUB_SIZE];
1764		int32 scrubCount = 0;
1765		for (int32 i = 0; i < reserved; i++) {
1766			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1767			if (page[i] == NULL)
1768				break;
1769
1770			DEBUG_PAGE_ACCESS_START(page[i]);
1771
1772			page[i]->SetState(PAGE_STATE_ACTIVE);
1773			page[i]->busy = true;
1774			scrubCount++;
1775		}
1776
1777		locker.Unlock();
1778
1779		if (scrubCount == 0) {
1780			unreserve_pages(reserved);
1781			continue;
1782		}
1783
1784		TA(ScrubbingPages(scrubCount));
1785
1786		// clear them
1787		for (int32 i = 0; i < scrubCount; i++)
1788			clear_page(page[i]);
1789
1790		locker.Lock();
1791
1792		// and put them into the clear queue
1793		for (int32 i = 0; i < scrubCount; i++) {
1794			page[i]->SetState(PAGE_STATE_CLEAR);
1795			page[i]->busy = false;
1796			DEBUG_PAGE_ACCESS_END(page[i]);
1797			sClearPageQueue.PrependUnlocked(page[i]);
1798		}
1799
1800		locker.Unlock();
1801
1802		unreserve_pages(reserved);
1803
1804		TA(ScrubbedPages(scrubCount));
1805	}
1806
1807	return 0;
1808}
1809
1810
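/*!	Initializes a dummy (busy, unused) page that can be inserted into a page
	queue as an iteration marker.
*/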
1811static void
1812init_page_marker(vm_page &marker)
1813{
1814	marker.SetCacheRef(NULL);
1815	marker.InitState(PAGE_STATE_UNUSED);
1816	marker.busy = true;
1817#if DEBUG_PAGE_QUEUE
1818	marker.queue = NULL;
1819#endif
1820#if DEBUG_PAGE_ACCESS
1821	marker.accessing_thread = thread_get_current_thread_id();
1822#endif
1823}
1824
1825
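/*!	Removes the given marker page from the queue it is currently in, if any,
	and resets its state to unused.
*/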
1826static void
1827remove_page_marker(struct vm_page &marker)
1828{
1829	DEBUG_PAGE_ACCESS_CHECK(&marker);
1830
1831	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1832		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1833
1834	marker.SetState(PAGE_STATE_UNUSED);
1835}
1836
1837
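/*!	Returns the next non-busy page of the modified queue, requeueing each
	page it inspects, or \c NULL if the queue is empty or \a maxPagesToSee
	pages have been looked at. \a maxPagesToSee is decremented accordingly.
*/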
1838static vm_page*
1839next_modified_page(page_num_t& maxPagesToSee)
1840{
1841	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1842
1843	while (maxPagesToSee > 0) {
1844		vm_page* page = sModifiedPageQueue.Head();
1845		if (page == NULL)
1846			return NULL;
1847
1848		sModifiedPageQueue.Requeue(page, true);
1849
1850		maxPagesToSee--;
1851
1852		if (!page->busy)
1853			return page;
1854	}
1855
1856	return NULL;
1857}
1858
1859
1860// #pragma mark -
1861
1862
1863class PageWriteTransfer;
1864class PageWriteWrapper;
1865
1866
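// One batch of pages the page writer flushes at a time: owns the per-page
// PageWriteWrappers and the per-cache PageWriteTransfers making up the run.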
1867class PageWriterRun {
1868public:
1869	status_t Init(uint32 maxPages);
1870
1871	void PrepareNextRun();
1872	void AddPage(vm_page* page);
1873	uint32 Go();
1874
1875	void PageWritten(PageWriteTransfer* transfer, status_t status,
1876		bool partialTransfer, size_t bytesTransferred);
1877
1878private:
1879	uint32				fMaxPages;
1880	uint32				fWrapperCount;
1881	uint32				fTransferCount;
1882	int32				fPendingTransfers;
1883	PageWriteWrapper*	fWrappers;
1884	PageWriteTransfer*	fTransfers;
1885	ConditionVariable	fAllFinishedCondition;
1886};
1887
1888
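// A single (possibly asynchronous) write to one cache: collects pages with
// adjacent cache offsets into I/O vectors and schedules them via the cache's
// Write()/WriteAsync() hooks.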
1889class PageWriteTransfer : public AsyncIOCallback {
1890public:
1891	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1892	bool AddPage(vm_page* page);
1893
1894	status_t Schedule(uint32 flags);
1895
1896	void SetStatus(status_t status, size_t transferred);
1897
1898	status_t Status() const	{ return fStatus; }
1899	struct VMCache* Cache() const { return fCache; }
1900	uint32 PageCount() const { return fPageCount; }
1901
1902	virtual void IOFinished(status_t status, bool partialTransfer,
1903		generic_size_t bytesTransferred);
1904private:
1905	PageWriterRun*		fRun;
1906	struct VMCache*		fCache;
1907	off_t				fOffset;
1908	uint32				fPageCount;
1909	int32				fMaxPages;
1910	status_t			fStatus;
1911	uint32				fVecCount;
1912	generic_io_vec		fVecs[32]; // TODO: make dynamic/configurable
1913};
1914
1915
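// Tracks a single page while it is being written back: marks it busy in
// SetTo() and, in Done(), moves it to the appropriate queue or frees it,
// depending on the I/O result.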
1916class PageWriteWrapper {
1917public:
1918	PageWriteWrapper();
1919	~PageWriteWrapper();
1920	void SetTo(vm_page* page);
1921	bool Done(status_t result);
1922
1923private:
1924	vm_page*			fPage;
1925	struct VMCache*		fCache;
1926	bool				fIsActive;
1927};
1928
1929
1930PageWriteWrapper::PageWriteWrapper()
1931	:
1932	fIsActive(false)
1933{
1934}
1935
1936
1937PageWriteWrapper::~PageWriteWrapper()
1938{
1939	if (fIsActive)
1940		panic("page write wrapper going out of scope but isn't completed");
1941}
1942
1943
1944/*!	The page's cache must be locked.
1945*/
1946void
1947PageWriteWrapper::SetTo(vm_page* page)
1948{
1949	DEBUG_PAGE_ACCESS_CHECK(page);
1950
1951	if (page->busy)
1952		panic("setting page write wrapper to busy page");
1953
1954	if (fIsActive)
1955		panic("re-setting page write wrapper that isn't completed");
1956
1957	fPage = page;
1958	fCache = page->Cache();
1959	fIsActive = true;
1960
1961	fPage->busy = true;
1962	fPage->busy_writing = true;
1963
1964	// We have a modified page -- however, while we're writing it back,
1965	// the page might still be mapped. In order not to lose any changes to the
1966	// page, we mark it clean before actually writing it back; if
1967	// writing the page fails for some reason, we'll just keep it in the
1968	// modified page list, but that should happen only rarely.
1969
1970	// If the page is changed after we cleared the dirty flag, but before we
1971	// had the chance to write it back, then we'll write it again later -- that
1972	// will probably not happen that often, though.
1973
1974	vm_clear_map_flags(fPage, PAGE_MODIFIED);
1975}
1976
1977
/*!	The page's cache must be locked.
	The page queues must not be locked.
	\return \c true if the page was written successfully or could be handled
		in some other way, \c false otherwise.
*/
1983bool
1984PageWriteWrapper::Done(status_t result)
1985{
1986	if (!fIsActive)
1987		panic("completing page write wrapper that is not active");
1988
1989	DEBUG_PAGE_ACCESS_START(fPage);
1990
1991	fPage->busy = false;
1992		// Set unbusy and notify later by hand, since we might free the page.
1993
1994	bool success = true;
1995
1996	if (result == B_OK) {
1997		// put it into the active/inactive queue
1998		move_page_to_appropriate_queue(fPage);
1999		fPage->busy_writing = false;
2000		DEBUG_PAGE_ACCESS_END(fPage);
2001	} else {
		// Writing the page failed. One reason would be that the cache has been
		// shrunk and the page no longer belongs to the file. Otherwise the
		// actual I/O failed, in which case we'll simply keep the page modified.
2005
2006		if (!fPage->busy_writing) {
2007			// The busy_writing flag was cleared. That means the cache has been
2008			// shrunk while we were trying to write the page and we have to free
2009			// it now.
2010			vm_remove_all_page_mappings(fPage);
2011// TODO: Unmapping should already happen when resizing the cache!
2012			fCache->RemovePage(fPage);
2013			free_page(fPage, false);
2014			unreserve_pages(1);
2015		} else {
2016			// Writing the page failed -- mark the page modified and move it to
2017			// an appropriate queue other than the modified queue, so we don't
2018			// keep trying to write it over and over again. We keep
2019			// non-temporary pages in the modified queue, though, so they don't
2020			// get lost in the inactive queue.
2021			dprintf("PageWriteWrapper: Failed to write page %p: %s\n", fPage,
2022				strerror(result));
2023
2024			fPage->modified = true;
2025			if (!fCache->temporary)
2026				set_page_state(fPage, PAGE_STATE_MODIFIED);
2027			else if (fPage->IsMapped())
2028				set_page_state(fPage, PAGE_STATE_ACTIVE);
2029			else
2030				set_page_state(fPage, PAGE_STATE_INACTIVE);
2031
2032			fPage->busy_writing = false;
2033			DEBUG_PAGE_ACCESS_END(fPage);
2034
2035			success = false;
2036		}
2037	}
2038
2039	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
2040	fIsActive = false;
2041
2042	return success;
2043}
2044
2045
2046/*!	The page's cache must be locked.
2047*/
2048void
2049PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
2050{
2051	fRun = run;
2052	fCache = page->Cache();
2053	fOffset = page->cache_offset;
2054	fPageCount = 1;
2055	fMaxPages = maxPages;
2056	fStatus = B_OK;
2057
2058	fVecs[0].base = (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2059	fVecs[0].length = B_PAGE_SIZE;
2060	fVecCount = 1;
2061}
2062
2063
2064/*!	The page's cache must be locked.
2065*/
2066bool
2067PageWriteTransfer::AddPage(vm_page* page)
2068{
2069	if (page->Cache() != fCache
2070		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
2071		return false;
2072
2073	phys_addr_t nextBase = fVecs[fVecCount - 1].base
2074		+ fVecs[fVecCount - 1].length;
2075
2076	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2077		&& (off_t)page->cache_offset == fOffset + fPageCount) {
2078		// append to last iovec
2079		fVecs[fVecCount - 1].length += B_PAGE_SIZE;
2080		fPageCount++;
2081		return true;
2082	}
2083
2084	nextBase = fVecs[0].base - B_PAGE_SIZE;
2085	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2086		&& (off_t)page->cache_offset == fOffset - 1) {
2087		// prepend to first iovec and adjust offset
2088		fVecs[0].base = nextBase;
2089		fVecs[0].length += B_PAGE_SIZE;
2090		fOffset = page->cache_offset;
2091		fPageCount++;
2092		return true;
2093	}
2094
2095	if (((off_t)page->cache_offset == fOffset + fPageCount
2096			|| (off_t)page->cache_offset == fOffset - 1)
2097		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
2098		// not physically contiguous or not in the right order
2099		uint32 vectorIndex;
2100		if ((off_t)page->cache_offset < fOffset) {
2101			// we are pre-pending another vector, move the other vecs
2102			for (uint32 i = fVecCount; i > 0; i--)
2103				fVecs[i] = fVecs[i - 1];
2104
2105			fOffset = page->cache_offset;
2106			vectorIndex = 0;
2107		} else
2108			vectorIndex = fVecCount;
2109
2110		fVecs[vectorIndex].base
2111			= (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2112		fVecs[vectorIndex].length = B_PAGE_SIZE;
2113
2114		fVecCount++;
2115		fPageCount++;
2116		return true;
2117	}
2118
2119	return false;
2120}
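

// A worked example of the coalescing logic above (page frames and cache
// offsets are purely illustrative): assume SetTo() was called with a page at
// cache offset 5 backed by physical frame 100.
//
//	AddPage(offset 6, frame 101) -> appended to fVecs[0]; the run is 2 pages
//	AddPage(offset 4, frame  99) -> prepended to fVecs[0]; fOffset becomes 4
//	AddPage(offset 7, frame 200) -> contiguous in the cache, but not in
//	                                physical memory, so it opens fVecs[1]
//	AddPage(offset 9, frame 201) -> not at the next cache offset (8), so it
//	                                cannot be merged; AddPage() returns false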
2121
2122
2123status_t
2124PageWriteTransfer::Schedule(uint32 flags)
2125{
2126	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
2127	generic_size_t writeLength = (phys_size_t)fPageCount << PAGE_SHIFT;
2128
2129	if (fRun != NULL) {
2130		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
2131			flags | B_PHYSICAL_IO_REQUEST, this);
2132	}
2133
2134	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
2135		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
2136
2137	SetStatus(status, writeLength);
2138	return fStatus;
2139}
2140
2141
2142void
2143PageWriteTransfer::SetStatus(status_t status, size_t transferred)
2144{
2145	// only succeed if all pages up to the last one have been written fully
2146	// and the last page has at least been written partially
2147	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
2148		status = B_ERROR;
2149
2150	fStatus = status;
2151}
2152
2153
2154void
2155PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
2156	generic_size_t bytesTransferred)
2157{
2158	SetStatus(status, bytesTransferred);
2159	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
2160}
2161
2162
2163status_t
2164PageWriterRun::Init(uint32 maxPages)
2165{
2166	fMaxPages = maxPages;
2167	fWrapperCount = 0;
2168	fTransferCount = 0;
2169	fPendingTransfers = 0;
2170
2171	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
2172	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
2173	if (fWrappers == NULL || fTransfers == NULL)
2174		return B_NO_MEMORY;
2175
2176	return B_OK;
2177}
2178
2179
2180void
2181PageWriterRun::PrepareNextRun()
2182{
2183	fWrapperCount = 0;
2184	fTransferCount = 0;
2185	fPendingTransfers = 0;
2186}
2187
2188
2189/*!	The page's cache must be locked.
2190*/
2191void
2192PageWriterRun::AddPage(vm_page* page)
2193{
2194	fWrappers[fWrapperCount++].SetTo(page);
2195
2196	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
2197		fTransfers[fTransferCount++].SetTo(this, page,
2198			page->Cache()->MaxPagesPerAsyncWrite());
2199	}
2200}
2201
2202
2203/*!	Writes all pages previously added.
2204	\return The number of pages that could neither be written nor handled.
2205*/
2206uint32
2207PageWriterRun::Go()
2208{
2209	atomic_set(&fPendingTransfers, fTransferCount);
2210
2211	fAllFinishedCondition.Init(this, "page writer wait for I/O");
2212	ConditionVariableEntry waitEntry;
2213	fAllFinishedCondition.Add(&waitEntry);
2214
2215	// schedule writes
2216	for (uint32 i = 0; i < fTransferCount; i++)
2217		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
2218
2219	// wait until all pages have been written
2220	waitEntry.Wait();
2221
2222	// mark pages depending on whether they could be written or not
2223
2224	uint32 failedPages = 0;
2225	uint32 wrapperIndex = 0;
2226	for (uint32 i = 0; i < fTransferCount; i++) {
2227		PageWriteTransfer& transfer = fTransfers[i];
2228		transfer.Cache()->Lock();
2229
2230		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2231			if (!fWrappers[wrapperIndex++].Done(transfer.Status()))
2232				failedPages++;
2233		}
2234
2235		transfer.Cache()->Unlock();
2236	}
2237
2238	ASSERT(wrapperIndex == fWrapperCount);
2239
2240	for (uint32 i = 0; i < fTransferCount; i++) {
2241		PageWriteTransfer& transfer = fTransfers[i];
2242		struct VMCache* cache = transfer.Cache();
2243
2244		// We've acquired references for each page
2245		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2246			// We release the cache references after all pages were made
2247			// unbusy again - otherwise releasing a vnode could deadlock.
2248			cache->ReleaseStoreRef();
2249			cache->ReleaseRef();
2250		}
2251	}
2252
2253	return failedPages;
2254}
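

// Condensed sketch of the run protocol as driven by page_writer() below
// (page selection and error handling omitted):
//
//	PageWriterRun run;
//	run.Init(kNumPages);			// once; allocates wrappers and transfers
//	while (true) {
//		run.PrepareNextRun();		// reset the counters for this round
//		// for each modified page picked: lock its cache, acquire a store
//		// reference and a cache reference, then
//		run.AddPage(page);
//		// ...
//		uint32 failedPages = run.Go();	// schedule the I/O, wait, clean up
//	}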
2255
2256
2257void
2258PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
2259	bool partialTransfer, size_t bytesTransferred)
2260{
2261	if (atomic_add(&fPendingTransfers, -1) == 1)
2262		fAllFinishedCondition.NotifyAll();
2263}
2264
2265
2266/*!	The page writer continuously takes some pages from the modified
2267	queue, writes them back, and moves them back to the active queue.
2268	It runs in its own thread, and is only there to keep the number
2269	of modified pages low, so that more pages can be reused at a lower
2270	cost.
2271*/
2272status_t
2273page_writer(void* /*unused*/)
2274{
2275	const uint32 kNumPages = 256;
2276#ifdef TRACE_VM_PAGE
2277	uint32 writtenPages = 0;
2278	bigtime_t lastWrittenTime = 0;
2279	bigtime_t pageCollectionTime = 0;
2280	bigtime_t pageWritingTime = 0;
2281#endif
2282
2283	PageWriterRun run;
2284	if (run.Init(kNumPages) != B_OK) {
2285		panic("page writer: Failed to init PageWriterRun!");
2286		return B_ERROR;
2287	}
2288
2289	page_num_t pagesSinceLastSuccessfulWrite = 0;
2290
2291	while (true) {
2292// TODO: Maybe wait shorter when memory is low!
2293		if (sModifiedPageQueue.Count() < kNumPages) {
2294			sPageWriterCondition.Wait(3000000, true);
2295				// every 3 seconds when no one triggers us
2296		}
2297
2298		page_num_t modifiedPages = sModifiedPageQueue.Count();
2299		if (modifiedPages == 0)
2300			continue;
2301
2302		if (modifiedPages <= pagesSinceLastSuccessfulWrite) {
2303			// We ran through the whole queue without being able to write a
2304			// single page. Take a break.
2305			snooze(500000);
2306			pagesSinceLastSuccessfulWrite = 0;
2307		}
2308
2309#if ENABLE_SWAP_SUPPORT
2310		page_stats pageStats;
2311		get_page_stats(pageStats);
2312		bool activePaging = do_active_paging(pageStats);
2313#endif
2314
2315		// depending on how urgent it becomes to get pages to disk, we adjust
2316		// our I/O priority
2317		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
2318		int32 ioPriority = B_IDLE_PRIORITY;
2319		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
2320			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
2321			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
2322		} else {
2323			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
2324				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
2325		}
2326
2327		thread_set_io_priority(ioPriority);
2328
2329		uint32 numPages = 0;
2330		run.PrepareNextRun();
2331
2332		// TODO: make this laptop friendly, too (ie. only start doing
2333		// something if someone else did something or there is really
2334		// enough to do).
2335
2336		// collect pages to be written
2337#ifdef TRACE_VM_PAGE
2338		pageCollectionTime -= system_time();
2339#endif
2340
2341		page_num_t maxPagesToSee = modifiedPages;
2342
2343		while (numPages < kNumPages && maxPagesToSee > 0) {
2344			vm_page *page = next_modified_page(maxPagesToSee);
2345			if (page == NULL)
2346				break;
2347
2348			PageCacheLocker cacheLocker(page, false);
2349			if (!cacheLocker.IsLocked())
2350				continue;
2351
2352			VMCache *cache = page->Cache();
2353
2354			// If the page is busy or its state has changed while we were
2355			// locking the cache, just ignore it.
2356			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
2357				continue;
2358
2359			DEBUG_PAGE_ACCESS_START(page);
2360
2361			// Don't write back wired (locked) pages.
2362			if (page->WiredCount() > 0) {
2363				set_page_state(page, PAGE_STATE_ACTIVE);
2364				DEBUG_PAGE_ACCESS_END(page);
2365				continue;
2366			}
2367
2368			// Write back temporary pages only when we're actively paging.
2369			if (cache->temporary
2370#if ENABLE_SWAP_SUPPORT
2371				&& (!activePaging
2372					|| !cache->CanWritePage(
2373							(off_t)page->cache_offset << PAGE_SHIFT))
2374#endif
2375				) {
2376				// We can't/don't want to do anything with this page, so move it
2377				// to one of the other queues.
2378				if (page->mappings.IsEmpty())
2379					set_page_state(page, PAGE_STATE_INACTIVE);
2380				else
2381					set_page_state(page, PAGE_STATE_ACTIVE);
2382
2383				DEBUG_PAGE_ACCESS_END(page);
2384				continue;
2385			}
2386
2387			// We need our own reference to the store, as it might currently be
2388			// destroyed.
2389			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
2390				DEBUG_PAGE_ACCESS_END(page);
2391				cacheLocker.Unlock();
2392				thread_yield();
2393				continue;
2394			}
2395
2396			run.AddPage(page);
2397				// TODO: We're possibly adding pages of different caches and
2398				// thus maybe of different underlying file systems here. This
2399				// is a potential problem for loop file systems/devices, since
2400				// we could mark a page busy that would need to be accessed
2401				// when writing back another page, thus causing a deadlock.
2402
2403			DEBUG_PAGE_ACCESS_END(page);
2404
2405			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
2406			TPW(WritePage(page));
2407
2408			cache->AcquireRefLocked();
2409			numPages++;
2410		}
2411
2412#ifdef TRACE_VM_PAGE
2413		pageCollectionTime += system_time();
2414#endif
2415		if (numPages == 0)
2416			continue;
2417
2418		// write pages to disk and do all the cleanup
2419#ifdef TRACE_VM_PAGE
2420		pageWritingTime -= system_time();
2421#endif
2422		uint32 failedPages = run.Go();
2423#ifdef TRACE_VM_PAGE
2424		pageWritingTime += system_time();
2425
2426		// debug output only...
2427		writtenPages += numPages;
2428		if (writtenPages >= 1024) {
2429			bigtime_t now = system_time();
2430			TRACE(("page writer: wrote 1024 pages (total: %" B_PRIu64 " ms, "
2431				"collect: %" B_PRIu64 " ms, write: %" B_PRIu64 " ms)\n",
2432				(now - lastWrittenTime) / 1000,
2433				pageCollectionTime / 1000, pageWritingTime / 1000));
2434			lastWrittenTime = now;
2435
2436			writtenPages -= 1024;
2437			pageCollectionTime = 0;
2438			pageWritingTime = 0;
2439		}
2440#endif
2441
2442		if (failedPages == numPages)
2443			pagesSinceLastSuccessfulWrite += modifiedPages - maxPagesToSee;
2444		else
2445			pagesSinceLastSuccessfulWrite = 0;
2446	}
2447
2448	return B_OK;
2449}
2450
2451
2452// #pragma mark -
2453
2454
2455// TODO: This should be done in the page daemon!
2456#if 0
2457#if ENABLE_SWAP_SUPPORT
2458static bool
2459free_page_swap_space(int32 index)
2460{
2461	vm_page *page = vm_page_at_index(index);
2462	PageCacheLocker locker(page);
2463	if (!locker.IsLocked())
2464		return false;
2465
2466	DEBUG_PAGE_ACCESS_START(page);
2467
2468	VMCache* cache = page->Cache();
2469	if (cache->temporary && page->WiredCount() == 0
2470			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2471			&& page->usage_count > 0) {
2472		// TODO: how to judge a page is highly active?
2473		if (swap_free_page_swap_space(page)) {
2474			// We need to mark the page modified, since otherwise it could be
2475			// stolen and we'd lose its data.
2476			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2477			TD(FreedPageSwap(page));
2478			DEBUG_PAGE_ACCESS_END(page);
2479			return true;
2480		}
2481	}
2482	DEBUG_PAGE_ACCESS_END(page);
2483	return false;
2484}
2485#endif
2486#endif	// 0
2487
2488
2489static vm_page *
2490find_cached_page_candidate(struct vm_page &marker)
2491{
2492	DEBUG_PAGE_ACCESS_CHECK(&marker);
2493
2494	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2495	vm_page *page;
2496
2497	if (marker.State() == PAGE_STATE_UNUSED) {
2498		// Get the first page of the cached queue
2499		page = sCachedPageQueue.Head();
2500	} else {
2501		// Get the next page of the current queue
2502		if (marker.State() != PAGE_STATE_CACHED) {
2503			panic("invalid marker %p state", &marker);
2504			return NULL;
2505		}
2506
2507		page = sCachedPageQueue.Next(&marker);
2508		sCachedPageQueue.Remove(&marker);
2509		marker.SetState(PAGE_STATE_UNUSED);
2510	}
2511
2512	while (page != NULL) {
2513		if (!page->busy) {
2514			// we found a candidate, insert marker
2515			marker.SetState(PAGE_STATE_CACHED);
2516			sCachedPageQueue.InsertAfter(page, &marker);
2517			return page;
2518		}
2519
2520		page = sCachedPageQueue.Next(page);
2521	}
2522
2523	return NULL;
2524}
2525
2526
2527static bool
2528free_cached_page(vm_page *page, bool dontWait)
2529{
2530	// try to lock the page's cache
2531	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2532		return false;
2533	VMCache* cache = page->Cache();
2534
2535	AutoLocker<VMCache> cacheLocker(cache, true);
2536	MethodDeleter<VMCache> _2(cache, &VMCache::ReleaseRefLocked);
2537
2538	// check again if that page is still a candidate
2539	if (page->busy || page->State() != PAGE_STATE_CACHED)
2540		return false;
2541
2542	DEBUG_PAGE_ACCESS_START(page);
2543
2544	PAGE_ASSERT(page, !page->IsMapped());
2545	PAGE_ASSERT(page, !page->modified);
2546
2547	// we can now steal this page
2548
2549	cache->RemovePage(page);
2550		// Now the page no longer has a cache, so no one else (e.g.
2551		// vm_page_allocate_page_run()) can pick it up, since they would be
2552		// required to lock the cache first, which would fail.
2553
2554	sCachedPageQueue.RemoveUnlocked(page);
2555	return true;
2556}
2557
2558
2559static uint32
2560free_cached_pages(uint32 pagesToFree, bool dontWait)
2561{
2562	vm_page marker;
2563	init_page_marker(marker);
2564
2565	uint32 pagesFreed = 0;
2566
2567	while (pagesFreed < pagesToFree) {
2568		vm_page *page = find_cached_page_candidate(marker);
2569		if (page == NULL)
2570			break;
2571
2572		if (free_cached_page(page, dontWait)) {
2573			ReadLocker locker(sFreePageQueuesLock);
2574			page->SetState(PAGE_STATE_FREE);
2575			DEBUG_PAGE_ACCESS_END(page);
2576			sFreePageQueue.PrependUnlocked(page);
2577			locker.Unlock();
2578
2579			TA(StolenPage());
2580
2581			pagesFreed++;
2582		}
2583	}
2584
2585	remove_page_marker(marker);
2586
2587	return pagesFreed;
2588}
2589
2590
2591static void
2592idle_scan_active_pages(page_stats& pageStats)
2593{
2594	VMPageQueue& queue = sActivePageQueue;
2595
2596	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2597	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
2598
2599	while (maxToScan > 0) {
2600		maxToScan--;
2601
2602		// Get the next page. Note that we don't bother to lock here. We go with
2603		// the assumption that on all architectures reading/writing pointers is
2604		// atomic. Beyond that it doesn't really matter. We have to unlock the
2605		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2606		vm_page* page = queue.Head();
2607		if (page == NULL)
2608			break;
2609
2610		// lock the page's cache
2611		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2612		if (cache == NULL)
2613			continue;
2614
2615		if (page->State() != PAGE_STATE_ACTIVE) {
2616			// page is no longer in the cache or in this queue
2617			cache->ReleaseRefAndUnlock();
2618			continue;
2619		}
2620
2621		if (page->busy) {
2622			// page is busy -- requeue at the end
2623			vm_page_requeue(page, true);
2624			cache->ReleaseRefAndUnlock();
2625			continue;
2626		}
2627
2628		DEBUG_PAGE_ACCESS_START(page);
2629
2630		// Get the page's accessed/modified flags and update its usage count.
2631		// We completely unmap inactive temporary pages. This saves us from
2632		// having to iterate through the inactive list as well, since we'll be
2633		// notified via page fault whenever such an inactive page is used again.
2634		// We don't remove the mappings of non-temporary pages, since we
2635		// wouldn't notice when they become unused and could thus be moved to
2636		// the cached list.
2637		int32 usageCount;
2638		if (page->WiredCount() > 0 || page->usage_count > 0
2639			|| !cache->temporary) {
2640			usageCount = vm_clear_page_mapping_accessed_flags(page);
2641		} else
2642			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2643
2644		if (usageCount > 0) {
2645			usageCount += page->usage_count + kPageUsageAdvance;
2646			if (usageCount > kPageUsageMax)
2647				usageCount = kPageUsageMax;
2648// TODO: This would probably also be the place to reclaim swap space.
2649		} else {
2650			usageCount += page->usage_count - (int32)kPageUsageDecline;
2651			if (usageCount < 0) {
2652				usageCount = 0;
2653				set_page_state(page, PAGE_STATE_INACTIVE);
2654			}
2655		}
2656
2657		page->usage_count = usageCount;
2658
2659		DEBUG_PAGE_ACCESS_END(page);
2660
2661		cache->ReleaseRefAndUnlock();
2662	}
2663}
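

// Numeric example of the aging above, using the constants defined at the top
// of this file (kPageUsageAdvance 3, kPageUsageDecline 1, kPageUsageMax 64):
// a page with usage_count 10 that was found accessed in two of its mappings
// ends up with min(2 + 10 + 3, 64) = 15; found unaccessed, it drops to
// 10 - 1 = 9. Only when the count would drop below 0 -- i.e. it was already 0
// and the page was again unaccessed -- is the page moved to the inactive
// queue.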
2664
2665
2666static void
2667full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2668{
2669	int32 pagesToFree = pageStats.unsatisfiedReservations
2670		+ sFreeOrCachedPagesTarget
2671		- (pageStats.totalFreePages + pageStats.cachedPages);
2672	if (pagesToFree <= 0)
2673		return;
2674
2675	bigtime_t time = system_time();
2676	uint32 pagesScanned = 0;
2677	uint32 pagesToCached = 0;
2678	uint32 pagesToModified = 0;
2679	uint32 pagesToActive = 0;
2680
2681	// Determine how many pages at maximum to send to the modified queue. Since
2682	// it is relatively expensive to page out pages, we do that on a grander
2683	// scale only when things get desperate.
2684	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2685
2686	vm_page marker;
2687	init_page_marker(marker);
2688
2689	VMPageQueue& queue = sInactivePageQueue;
2690	InterruptsSpinLocker queueLocker(queue.GetLock());
2691	uint32 maxToScan = queue.Count();
2692
2693	vm_page* nextPage = queue.Head();
2694
2695	while (pagesToFree > 0 && maxToScan > 0) {
2696		maxToScan--;
2697
2698		// get the next page
2699		vm_page* page = nextPage;
2700		if (page == NULL)
2701			break;
2702		nextPage = queue.Next(page);
2703
2704		if (page->busy)
2705			continue;
2706
2707		// mark the position
2708		queue.InsertAfter(page, &marker);
2709		queueLocker.Unlock();
2710
2711		// lock the page's cache
2712		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2713		if (cache == NULL || page->busy
2714				|| page->State() != PAGE_STATE_INACTIVE) {
2715			if (cache != NULL)
2716				cache->ReleaseRefAndUnlock();
2717			queueLocker.Lock();
2718			nextPage = queue.Next(&marker);
2719			queue.Remove(&marker);
2720			continue;
2721		}
2722
2723		pagesScanned++;
2724
2725		DEBUG_PAGE_ACCESS_START(page);
2726
2727		// Get the accessed count, clear the accessed/modified flags and
2728		// unmap the page, if it hasn't been accessed.
2729		int32 usageCount;
2730		if (page->WiredCount() > 0)
2731			usageCount = vm_clear_page_mapping_accessed_flags(page);
2732		else
2733			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2734
2735		// update usage count
2736		if (usageCount > 0) {
2737			usageCount += page->usage_count + kPageUsageAdvance;
2738			if (usageCount > kPageUsageMax)
2739				usageCount = kPageUsageMax;
2740		} else {
2741			usageCount += page->usage_count - (int32)kPageUsageDecline;
2742			if (usageCount < 0)
2743				usageCount = 0;
2744		}
2745
2746		page->usage_count = usageCount;
2747
2748		// Move to fitting queue or requeue:
2749		// * Active mapped pages go to the active queue.
2750		// * Inactive mapped (i.e. wired) pages are requeued.
2751		// * The remaining pages are cacheable. Thus, if unmodified they go to
2752		//   the cached queue, otherwise to the modified queue (up to a limit).
2753		//   Note that, unlike in the idle scan, we don't exempt pages of
2754		//   temporary caches here. Apparently we really need memory, so we had
2755		//   better page them out as well.
2756		bool isMapped = page->IsMapped();
2757		if (usageCount > 0) {
2758			if (isMapped) {
2759				set_page_state(page, PAGE_STATE_ACTIVE);
2760				pagesToActive++;
2761			} else
2762				vm_page_requeue(page, true);
2763		} else if (isMapped) {
2764			vm_page_requeue(page, true);
2765		} else if (!page->modified) {
2766			set_page_state(page, PAGE_STATE_CACHED);
2767			pagesToFree--;
2768			pagesToCached++;
2769		} else if (maxToFlush > 0) {
2770			set_page_state(page, PAGE_STATE_MODIFIED);
2771			maxToFlush--;
2772			pagesToModified++;
2773		} else
2774			vm_page_requeue(page, true);
2775
2776		DEBUG_PAGE_ACCESS_END(page);
2777
2778		cache->ReleaseRefAndUnlock();
2779
2780		// remove the marker
2781		queueLocker.Lock();
2782		nextPage = queue.Next(&marker);
2783		queue.Remove(&marker);
2784	}
2785
2786	queueLocker.Unlock();
2787
2788	time = system_time() - time;
2789	TRACE_DAEMON("  -> inactive scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2790		", moved: %" B_PRIu32 " -> cached, %" B_PRIu32 " -> modified, %"
2791		B_PRIu32 " -> active\n", time, pagesScanned, pagesToCached,
2792		pagesToModified, pagesToActive);
2793
2794	// wake up the page writer, if we tossed it some pages
2795	if (pagesToModified > 0)
2796		sPageWriterCondition.WakeUp();
2797}
2798
2799
2800static void
2801full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2802{
2803	vm_page marker;
2804	init_page_marker(marker);
2805
2806	VMPageQueue& queue = sActivePageQueue;
2807	InterruptsSpinLocker queueLocker(queue.GetLock());
2808	uint32 maxToScan = queue.Count();
2809
2810	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2811		+ sFreeOrCachedPagesTarget
2812		- (pageStats.totalFreePages + pageStats.cachedPages)
2813		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2814	if (pagesToDeactivate <= 0)
2815		return;
2816
2817	bigtime_t time = system_time();
2818	uint32 pagesAccessed = 0;
2819	uint32 pagesToInactive = 0;
2820	uint32 pagesScanned = 0;
2821
2822	vm_page* nextPage = queue.Head();
2823
2824	while (pagesToDeactivate > 0 && maxToScan > 0) {
2825		maxToScan--;
2826
2827		// get the next page
2828		vm_page* page = nextPage;
2829		if (page == NULL)
2830			break;
2831		nextPage = queue.Next(page);
2832
2833		if (page->busy)
2834			continue;
2835
2836		// mark the position
2837		queue.InsertAfter(page, &marker);
2838		queueLocker.Unlock();
2839
2840		// lock the page's cache
2841		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2842		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2843			if (cache != NULL)
2844				cache->ReleaseRefAndUnlock();
2845			queueLocker.Lock();
2846			nextPage = queue.Next(&marker);
2847			queue.Remove(&marker);
2848			continue;
2849		}
2850
2851		pagesScanned++;
2852
2853		DEBUG_PAGE_ACCESS_START(page);
2854
2855		// Get the page active/modified flags and update the page's usage count.
2856		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2857
2858		if (usageCount > 0) {
2859			usageCount += page->usage_count + kPageUsageAdvance;
2860			if (usageCount > kPageUsageMax)
2861				usageCount = kPageUsageMax;
2862			pagesAccessed++;
2863// TODO: This would probably also be the place to reclaim swap space.
2864		} else {
2865			usageCount += page->usage_count - (int32)kPageUsageDecline;
2866			if (usageCount <= 0) {
2867				usageCount = 0;
2868				set_page_state(page, PAGE_STATE_INACTIVE);
2869				pagesToInactive++;
2870			}
2871		}
2872
2873		page->usage_count = usageCount;
2874
2875		DEBUG_PAGE_ACCESS_END(page);
2876
2877		cache->ReleaseRefAndUnlock();
2878
2879		// remove the marker
2880		queueLocker.Lock();
2881		nextPage = queue.Next(&marker);
2882		queue.Remove(&marker);
2883	}
2884
2885	time = system_time() - time;
2886	TRACE_DAEMON("  ->   active scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2887		", moved: %" B_PRIu32 " -> inactive, encountered %" B_PRIu32 " accessed"
2888		" ones\n", time, pagesScanned, pagesToInactive, pagesAccessed);
2889}
2890
2891
2892static void
2893page_daemon_idle_scan(page_stats& pageStats)
2894{
2895	TRACE_DAEMON("page daemon: idle run\n");
2896
2897	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2898		// We want more actually free pages, so free some from the cached
2899		// ones.
2900		uint32 freed = free_cached_pages(
2901			sFreePagesTarget - pageStats.totalFreePages, false);
2902		if (freed > 0)
2903			unreserve_pages(freed);
2904		get_page_stats(pageStats);
2905	}
2906
2907	// Walk the active list and move pages to the inactive queue.
2908	get_page_stats(pageStats);
2909	idle_scan_active_pages(pageStats);
2910}
2911
2912
2913static void
2914page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
2915{
2916	TRACE_DAEMON("page daemon: full run: free: %" B_PRIu32 ", cached: %"
2917		B_PRIu32 ", to free: %" B_PRIu32 "\n", pageStats.totalFreePages,
2918		pageStats.cachedPages, pageStats.unsatisfiedReservations
2919			+ sFreeOrCachedPagesTarget
2920			- (pageStats.totalFreePages + pageStats.cachedPages));
2921
2922	// Walk the inactive list and transfer pages to the cached and modified
2923	// queues.
2924	full_scan_inactive_pages(pageStats, despairLevel);
2925
2926	// Free cached pages. Also wake up reservation waiters.
2927	get_page_stats(pageStats);
2928	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
2929		- (pageStats.totalFreePages);
2930	if (pagesToFree > 0) {
2931		uint32 freed = free_cached_pages(pagesToFree, true);
2932		if (freed > 0)
2933			unreserve_pages(freed);
2934	}
2935
2936	// Walk the active list and move pages to the inactive queue.
2937	get_page_stats(pageStats);
2938	full_scan_active_pages(pageStats, despairLevel);
2939}
2940
2941
2942static status_t
2943page_daemon(void* /*unused*/)
2944{
2945	int32 despairLevel = 0;
2946
2947	while (true) {
2948		sPageDaemonCondition.ClearActivated();
2949
2950		// evaluate the free pages situation
2951		page_stats pageStats;
2952		get_page_stats(pageStats);
2953
2954		if (!do_active_paging(pageStats)) {
2955			// Things look good -- just maintain statistics and keep the pool
2956			// of actually free pages full enough.
2957			despairLevel = 0;
2958			page_daemon_idle_scan(pageStats);
2959			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
2960		} else {
2961			// Not enough free pages. We need to do some real work.
2962			despairLevel = std::min(despairLevel + 1, (int32)3);
2963			page_daemon_full_scan(pageStats, despairLevel);
2964
2965			// Don't wait after the first full scan, but rather immediately
2966			// check whether we were successful in freeing enough pages and
2967			// re-run with increased despair level. The first scan is
2968			// conservative with respect to moving inactive modified pages to
2969			// the modified list to avoid thrashing. The second scan, however,
2970			// will not hold back.
2971			if (despairLevel > 1)
2972				snooze(kBusyScanWaitInterval);
2973		}
2974	}
2975
2976	return B_OK;
2977}
2978
2979
2980/*!	Returns how many pages could *not* be reserved.
2981*/
2982static uint32
2983reserve_pages(uint32 count, int priority, bool dontWait)
2984{
2985	int32 dontTouch = kPageReserveForPriority[priority];
2986
2987	while (true) {
2988		count -= reserve_some_pages(count, dontTouch);
2989		if (count == 0)
2990			return 0;
2991
2992		if (sUnsatisfiedPageReservations == 0) {
2993			count -= free_cached_pages(count, dontWait);
2994			if (count == 0)
2995				return count;
2996		}
2997
2998		if (dontWait)
2999			return count;
3000
3001		// we need to wait for pages to become available
3002
3003		MutexLocker pageDeficitLocker(sPageDeficitLock);
3004
3005		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
3006		sUnsatisfiedPageReservations += count;
3007
3008		if (atomic_get(&sUnreservedFreePages) > dontTouch) {
3009			// the situation changed
3010			sUnsatisfiedPageReservations -= count;
3011			continue;
3012		}
3013
3014		PageReservationWaiter waiter;
3015		waiter.dontTouch = dontTouch;
3016		waiter.missing = count;
3017		waiter.thread = thread_get_current_thread();
3018		waiter.threadPriority = waiter.thread->priority;
3019
3020		// insert ordered (i.e. after all waiters with higher or equal priority)
3021		PageReservationWaiter* otherWaiter = NULL;
3022		for (PageReservationWaiterList::Iterator it
3023				= sPageReservationWaiters.GetIterator();
3024			(otherWaiter = it.Next()) != NULL;) {
3025			if (waiter < *otherWaiter)
3026				break;
3027		}
3028
3029		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
3030
3031		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
3032			"waiting for pages");
3033
3034		if (notifyDaemon)
3035			sPageDaemonCondition.WakeUp();
3036
3037		pageDeficitLocker.Unlock();
3038
3039		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
3040		thread_block();
3041
3042		pageDeficitLocker.Lock();
3043
3044		return 0;
3045	}
3046}
3047
3048
3049//	#pragma mark - private kernel API
3050
3051
3052/*!	Writes a range of modified pages of a cache to disk.
3053	You need to hold the VMCache lock when calling this function.
3054	Note that the cache lock is temporarily released in this function.
3055	\param cache The cache.
3056	\param firstPage Offset (in page size units) of the first page in the range.
3057	\param endPage End offset (in page size units) of the page range. The page
3058		at this offset is not included.
3059*/
3060status_t
3061vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
3062	uint32 endPage)
3063{
3064	static const int32 kMaxPages = 256;
3065	int32 maxPages = cache->MaxPagesPerWrite();
3066	if (maxPages < 0 || maxPages > kMaxPages)
3067		maxPages = kMaxPages;
3068
3069	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
3070		| HEAP_DONT_LOCK_KERNEL_SPACE;
3071
3072	PageWriteWrapper stackWrappersPool[2];
3073	PageWriteWrapper* stackWrappers[1];
3074	PageWriteWrapper* wrapperPool
3075		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
3076	PageWriteWrapper** wrappers
3077		= new(malloc_flags(allocationFlags)) PageWriteWrapper*[maxPages];
3078	if (wrapperPool == NULL || wrappers == NULL) {
3079		// don't fail, just limit our capabilities
3080		delete[] wrapperPool;
3081		delete[] wrappers;
3082		wrapperPool = stackWrappersPool;
3083		wrappers = stackWrappers;
3084		maxPages = 1;
3085	}
3086
3087	int32 nextWrapper = 0;
3088	int32 usedWrappers = 0;
3089
3090	PageWriteTransfer transfer;
3091	bool transferEmpty = true;
3092
3093	VMCachePagesTree::Iterator it
3094		= cache->pages.GetIterator(firstPage, true, true);
3095
3096	while (true) {
3097		vm_page* page = it.Next();
3098		if (page == NULL || page->cache_offset >= endPage) {
3099			if (transferEmpty)
3100				break;
3101
3102			page = NULL;
3103		}
3104
3105		if (page != NULL) {
3106			if (page->busy
3107				|| (page->State() != PAGE_STATE_MODIFIED
3108					&& !vm_test_map_modification(page))) {
3109				page = NULL;
3110			}
3111		}
3112
3113		PageWriteWrapper* wrapper = NULL;
3114		if (page != NULL) {
3115			wrapper = &wrapperPool[nextWrapper++];
3116			if (nextWrapper > maxPages)
3117				nextWrapper = 0;
3118
3119			DEBUG_PAGE_ACCESS_START(page);
3120
3121			wrapper->SetTo(page);
3122
3123			if (transferEmpty || transfer.AddPage(page)) {
3124				if (transferEmpty) {
3125					transfer.SetTo(NULL, page, maxPages);
3126					transferEmpty = false;
3127				}
3128
3129				DEBUG_PAGE_ACCESS_END(page);
3130
3131				wrappers[usedWrappers++] = wrapper;
3132				continue;
3133			}
3134
3135			DEBUG_PAGE_ACCESS_END(page);
3136		}
3137
3138		if (transferEmpty)
3139			continue;
3140
3141		cache->Unlock();
3142		status_t status = transfer.Schedule(0);
3143		cache->Lock();
3144
3145		for (int32 i = 0; i < usedWrappers; i++)
3146			wrappers[i]->Done(status);
3147
3148		usedWrappers = 0;
3149
3150		if (page != NULL) {
3151			transfer.SetTo(NULL, page, maxPages);
3152			wrappers[usedWrappers++] = wrapper;
3153		} else
3154			transferEmpty = true;
3155	}
3156
3157	if (wrapperPool != stackWrappersPool) {
3158		delete[] wrapperPool;
3159		delete[] wrappers;
3160	}
3161
3162	return B_OK;
3163}
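

// Usage sketch (hypothetical caller; cache, firstPage and endPage are
// placeholders): synchronously flushing a range of a cache's modified pages,
// e.g. before the backing file is resized. The cache must be locked by the
// caller and is still locked on return, although the lock is dropped while
// the individual transfers are scheduled.
//
//	cache->Lock();
//	status_t status = vm_page_write_modified_page_range(cache, firstPage,
//		endPage);
//	cache->Unlock();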
3164
3165
3166/*!	You need to hold the VMCache lock when calling this function.
3167	Note that the cache lock is temporarily released in this function.
3168*/
3169status_t
3170vm_page_write_modified_pages(VMCache *cache)
3171{
3172	return vm_page_write_modified_page_range(cache, 0,
3173		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
3174}
3175
3176
3177/*!	Schedules the page writer to write back the specified \a page.
3178	Note, however, that it might not do this immediately, and it can well
3179	take several seconds until the page is actually written out.
3180*/
3181void
3182vm_page_schedule_write_page(vm_page *page)
3183{
3184	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
3185
3186	vm_page_requeue(page, false);
3187
3188	sPageWriterCondition.WakeUp();
3189}
3190
3191
3192/*!	Cache must be locked.
3193*/
3194void
3195vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
3196	uint32 endPage)
3197{
3198	uint32 modified = 0;
3199	for (VMCachePagesTree::Iterator it
3200				= cache->pages.GetIterator(firstPage, true, true);
3201			vm_page *page = it.Next();) {
3202		if (page->cache_offset >= endPage)
3203			break;
3204
3205		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
3206			DEBUG_PAGE_ACCESS_START(page);
3207			vm_page_requeue(page, false);
3208			modified++;
3209			DEBUG_PAGE_ACCESS_END(page);
3210		}
3211	}
3212
3213	if (modified > 0)
3214		sPageWriterCondition.WakeUp();
3215}
3216
3217
3218void
3219vm_page_init_num_pages(kernel_args *args)
3220{
3221	// calculate the size of memory by looking at the physical_memory_range array
3222	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
3223	page_num_t physicalPagesEnd = sPhysicalPageOffset
3224		+ args->physical_memory_range[0].size / B_PAGE_SIZE;
3225
3226	sNonExistingPages = 0;
3227	sIgnoredPages = args->ignored_physical_memory / B_PAGE_SIZE;
3228
3229	for (uint32 i = 1; i < args->num_physical_memory_ranges; i++) {
3230		page_num_t start = args->physical_memory_range[i].start / B_PAGE_SIZE;
3231		if (start > physicalPagesEnd)
3232			sNonExistingPages += start - physicalPagesEnd;
3233		physicalPagesEnd = start
3234			+ args->physical_memory_range[i].size / B_PAGE_SIZE;
3235
3236#ifdef LIMIT_AVAILABLE_MEMORY
3237		page_num_t available
3238			= physicalPagesEnd - sPhysicalPageOffset - sNonExistingPages;
3239		if (available > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE)) {
3240			physicalPagesEnd = sPhysicalPageOffset + sNonExistingPages
3241				+ LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
3242			break;
3243		}
3244#endif
3245	}
3246
3247	TRACE(("first phys page = %#" B_PRIxPHYSADDR ", end %#" B_PRIxPHYSADDR "\n",
3248		sPhysicalPageOffset, physicalPagesEnd));
3249
3250	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
3251}
3252
3253
3254status_t
3255vm_page_init(kernel_args *args)
3256{
3257	TRACE(("vm_page_init: entry\n"));
3258
3259	// init page queues
3260	sModifiedPageQueue.Init("modified pages queue");
3261	sInactivePageQueue.Init("inactive pages queue");
3262	sActivePageQueue.Init("active pages queue");
3263	sCachedPageQueue.Init("cached pages queue");
3264	sFreePageQueue.Init("free pages queue");
3265	sClearPageQueue.Init("clear pages queue");
3266
3267	new (&sPageReservationWaiters) PageReservationWaiterList;
3268
3269	// map in the new free page table
3270	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
3271		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3272
3273	TRACE(("vm_init: putting free_page_table @ %p, # ents %" B_PRIuPHYSADDR
3274		" (size %#" B_PRIxPHYSADDR ")\n", sPages, sNumPages,
3275		(phys_addr_t)(sNumPages * sizeof(vm_page))));
3276
3277	// initialize the free page table
3278	for (uint32 i = 0; i < sNumPages; i++) {
3279		sPages[i].Init(sPhysicalPageOffset + i);
3280		sFreePageQueue.Append(&sPages[i]);
3281
3282#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3283		sPages[i].allocation_tracking_info.Clear();
3284#endif
3285	}
3286
3287	sUnreservedFreePages = sNumPages;
3288
3289	TRACE(("initialized table\n"));
3290
3291	// mark the ranges between usable physical memory unused
3292	phys_addr_t previousEnd = 0;
3293	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3294		phys_addr_t base = args->physical_memory_range[i].start;
3295		phys_size_t size = args->physical_memory_range[i].size;
3296		if (base > previousEnd) {
3297			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
3298				(base - previousEnd) / B_PAGE_SIZE, false);
3299		}
3300		previousEnd = base + size;
3301	}
3302
3303	// mark the allocated physical page ranges wired
3304	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3305		mark_page_range_in_use(
3306			args->physical_allocated_range[i].start / B_PAGE_SIZE,
3307			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
3308	}
3309
3310	// The target of actually free pages. This must be at least the system
3311	// reserve, but should be a few more pages, so we don't have to extract
3312	// a cached page with each allocation.
3313	sFreePagesTarget = VM_PAGE_RESERVE_USER
3314		+ std::max((page_num_t)32, (sNumPages - sNonExistingPages) / 1024);
3315
3316	// The target of free + cached and inactive pages. On low-memory machines
3317	// keep things tight. free + cached is the pool of immediately allocatable
3318	// pages. We want a few inactive pages, so when we're actually paging, we
3319	// have a reasonably large set of pages to work with.
3320	if (sUnreservedFreePages < 16 * 1024) {
3321		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
3322		sInactivePagesTarget = sFreePagesTarget / 3;
3323	} else {
3324		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
3325		sInactivePagesTarget = sFreePagesTarget / 2;
3326	}
3327
3328	TRACE(("vm_page_init: exit\n"));
3329
3330	return B_OK;
3331}
3332
3333
3334status_t
3335vm_page_init_post_area(kernel_args *args)
3336{
3337	void *dummy;
3338
3339	dummy = sPages;
3340	create_area("page structures", &dummy, B_EXACT_ADDRESS,
3341		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
3342		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3343
3344	add_debugger_command("page_stats", &dump_page_stats,
3345		"Dump statistics about page usage");
3346	add_debugger_command_etc("page", &dump_page,
3347		"Dump page info",
3348		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
3349		"Prints information for the physical page. If neither \"-p\" nor\n"
3350		"\"-v\" are given, the provided address is interpreted as address of\n"
3351		"the vm_page data structure for the page in question. If \"-p\" is\n"
3352		"given, the address is the physical address of the page. If \"-v\" is\n"
3353		"given, the address is interpreted as virtual address in the current\n"
3354		"thread's address space and for the page it is mapped to (if any)\n"
3355		"information are printed. If \"-m\" is specified, the command will\n"
3356		"search all known address spaces for mappings to that page and print\n"
3357		"them.\n", 0);
3358	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
3359	add_debugger_command("find_page", &find_page,
3360		"Find out which queue a page is actually in");
3361
3362#ifdef TRACK_PAGE_USAGE_STATS
3363	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
3364		"Dumps statistics about page usage counts",
3365		"\n"
3366		"Dumps statistics about page usage counts.\n",
3367		B_KDEBUG_DONT_PARSE_ARGUMENTS);
3368#endif
3369
3370#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3371	add_debugger_command_etc("page_allocations_per_caller",
3372		&dump_page_allocations_per_caller,
3373		"Dump current page allocations summed up per caller",
3374		"[ -d <caller> ] [ -r ]\n"
3375		"The current allocations will by summed up by caller (their count)\n"
3376		"printed in decreasing order by count.\n"
3377		"If \"-d\" is given, each allocation for caller <caller> is printed\n"
3378		"including the respective stack trace.\n"
3379		"If \"-r\" is given, the allocation infos are reset after gathering\n"
3380		"the information, so the next command invocation will only show the\n"
3381		"allocations made after the reset.\n", 0);
3382	add_debugger_command_etc("page_allocation_infos",
3383		&dump_page_allocation_infos,
3384		"Dump current page allocations",
3385		"[ --stacktrace ] [ -p <page number> ] [ --team <team ID> ] "
3386		"[ --thread <thread ID> ]\n"
3387		"The current allocations filtered by optional values will be printed.\n"
3388		"The optional \"-p\" page number filters for a specific page,\n"
3389		"with \"--team\" and \"--thread\" allocations by specific teams\n"
3390		"and/or threads can be filtered (these only work if a corresponding\n"
3391		"tracing entry is still available).\n"
3392		"If \"--stacktrace\" is given, then stack traces of the allocation\n"
3393		"callers are printed, where available\n", 0);
3394#endif
3395
3396	return B_OK;
3397}
3398
3399
3400status_t
3401vm_page_init_post_thread(kernel_args *args)
3402{
3403	new (&sFreePageCondition) ConditionVariable;
3404	sFreePageCondition.Publish(&sFreePageQueue, "free page");
3405
3406	// create a kernel thread to clear out pages
3407
3408	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
3409		B_LOWEST_ACTIVE_PRIORITY, NULL);
3410	resume_thread(thread);
3411
3412	// start page writer
3413
3414	sPageWriterCondition.Init("page writer");
3415
3416	thread = spawn_kernel_thread(&page_writer, "page writer",
3417		B_NORMAL_PRIORITY + 1, NULL);
3418	resume_thread(thread);
3419
3420	// start page daemon
3421
3422	sPageDaemonCondition.Init("page daemon");
3423
3424	thread = spawn_kernel_thread(&page_daemon, "page daemon",
3425		B_NORMAL_PRIORITY, NULL);
3426	resume_thread(thread);
3427
3428	return B_OK;
3429}
3430
3431
3432status_t
3433vm_mark_page_inuse(page_num_t page)
3434{
3435	return vm_mark_page_range_inuse(page, 1);
3436}
3437
3438
3439status_t
3440vm_mark_page_range_inuse(page_num_t startPage, page_num_t length)
3441{
3442	return mark_page_range_in_use(startPage, length, false);
3443}
3444
3445
3446/*!	Unreserve pages previously reserved with vm_page_reserve_pages().
3447*/
3448void
3449vm_page_unreserve_pages(vm_page_reservation* reservation)
3450{
3451	uint32 count = reservation->count;
3452	reservation->count = 0;
3453
3454	if (count == 0)
3455		return;
3456
3457	TA(UnreservePages(count));
3458
3459	unreserve_pages(count);
3460}
3461
3462
3463/*!	With this call, you can reserve a number of free pages in the system.
3464	They will only be handed out to someone who has actually reserved them.
3465	This call returns as soon as the number of requested pages has been
3466	reached.
3467	The caller must not hold any cache lock or the function might deadlock.
3468*/
3469void
3470vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
3471	int priority)
3472{
3473	reservation->count = count;
3474
3475	if (count == 0)
3476		return;
3477
3478	TA(ReservePages(count));
3479
3480	reserve_pages(count, priority, false);
3481}
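

// Minimal usage sketch of the reservation API. VM_PRIORITY_USER is assumed to
// be a valid reservation priority (it is not defined in this file); everything
// else uses only functions defined here.
//
//	vm_page_reservation reservation;
//	vm_page_reserve_pages(&reservation, 4, VM_PRIORITY_USER);
//		// may block until 4 pages are available; don't hold cache locks here
//
//	vm_page* page = vm_page_allocate_page(&reservation, PAGE_STATE_ACTIVE);
//		// consumes one of the reserved pages
//
//	vm_page_unreserve_pages(&reservation);
//		// returns the unused remainder of the reservation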
3482
3483
3484bool
3485vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
3486	int priority)
3487{
3488	if (count == 0) {
3489		reservation->count = count;
3490		return true;
3491	}
3492
3493	uint32 remaining = reserve_pages(count, priority, true);
3494	if (remaining == 0) {
3495		TA(ReservePages(count));
3496		reservation->count = count;
3497		return true;
3498	}
3499
3500	unreserve_pages(count - remaining);
3501
3502	return false;
3503}
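

// Sketch of the non-blocking variant for callers that must not wait, e.g.
// because they hold locks. VM_PRIORITY_SYSTEM is assumed to be a valid
// reservation priority; the fallback path is up to the caller.
//
//	vm_page_reservation reservation;
//	if (!vm_page_try_reserve_pages(&reservation, count, VM_PRIORITY_SYSTEM)) {
//		// not enough pages available right now -- back off instead of blocking
//		return B_WOULD_BLOCK;
//	}
//	// ... allocate from the reservation ...
//	vm_page_unreserve_pages(&reservation);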
3504
3505
3506vm_page *
3507vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3508{
3509	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3510	ASSERT(pageState != PAGE_STATE_FREE);
3511	ASSERT(pageState != PAGE_STATE_CLEAR);
3512
3513	ASSERT(reservation->count > 0);
3514	reservation->count--;
3515
3516	VMPageQueue* queue;
3517	VMPageQueue* otherQueue;
3518
3519	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3520		queue = &sClearPageQueue;
3521		otherQueue = &sFreePageQueue;
3522	} else {
3523		queue = &sFreePageQueue;
3524		otherQueue = &sClearPageQueue;
3525	}
3526
3527	ReadLocker locker(sFreePageQueuesLock);
3528
3529	vm_page* page = queue->RemoveHeadUnlocked();
3530	if (page == NULL) {
3531		// if the primary queue was empty, grab the page from the
3532		// secondary queue
3533		page = otherQueue->RemoveHeadUnlocked();
3534
3535		if (page == NULL) {
3536			// Unlikely, but possible: the page we have reserved has moved
3537			// between the queues after we checked the first queue. Grab the
3538			// write locker to make sure this doesn't happen again.
3539			locker.Unlock();
3540			WriteLocker writeLocker(sFreePageQueuesLock);
3541
3542			page = queue->RemoveHead();
3543			if (page == NULL)
3544				page = otherQueue->RemoveHead();
3545
3546			if (page == NULL) {
3547				panic("Had reserved page, but there is none!");
3548				return NULL;
3549			}
3550
3551			// downgrade to read lock
3552			locker.Lock();
3553		}
3554	}
3555
3556	if (page->CacheRef() != NULL)
3557		panic("supposed to be free page %p has cache\n", page);
3558
3559	DEBUG_PAGE_ACCESS_START(page);
3560
3561	int oldPageState = page->State();
3562	page->SetState(pageState);
3563	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3564	page->usage_count = 0;
3565	page->accessed = false;
3566	page->modified = false;
3567
3568	locker.Unlock();
3569
3570	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3571		sPageQueues[pageState].AppendUnlocked(page);
3572
3573	// clear the page, if we had to take it from the free queue and a clear
3574	// page was requested
3575	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3576		clear_page(page);
3577
3578#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3579	page->allocation_tracking_info.Init(
3580		TA(AllocatePage(page->physical_page_number)));
3581#else
3582	TA(AllocatePage(page->physical_page_number));
3583#endif
3584
3585	return page;
3586}
3587
3588
3589static void
3590allocate_page_run_cleanup(VMPageQueue::PageList& freePages,
3591	VMPageQueue::PageList& clearPages)
3592{
3593	while (vm_page* page = freePages.RemoveHead()) {
3594		page->busy = false;
3595		page->SetState(PAGE_STATE_FREE);
3596		DEBUG_PAGE_ACCESS_END(page);
3597		sFreePageQueue.PrependUnlocked(page);
3598	}
3599
3600	while (vm_page* page = clearPages.RemoveHead()) {
3601		page->busy = false;
3602		page->SetState(PAGE_STATE_CLEAR);
3603		DEBUG_PAGE_ACCESS_END(page);
3604		sClearPageQueue.PrependUnlocked(page);
3605	}
3606}
3607
3608
3609/*!	Tries to allocate a contiguous run of \a length pages starting at
3610	index \a start.
3611
3612	The caller must have write-locked the free/clear page queues. The function
3613	will unlock regardless of whether it succeeds or fails.
3614
3615	If the function fails, it cleans up after itself, i.e. it will free all
3616	pages it managed to allocate.
3617
3618	\param start The start index (into \c sPages) of the run.
3619	\param length The number of pages to allocate.
3620	\param flags Page allocation flags. Encodes the state the function shall
3621		set the allocated pages to, whether the pages shall be marked busy
3622		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3623		(VM_PAGE_ALLOC_CLEAR).
3624	\param freeClearQueueLocker WriteLocker for the free/clear page queues,
3625		passed in locked state. Will be unlocked by the function.
3626	\return The index of the first page that could not be allocated. \a length
3627		is returned when the function was successful.
3628*/
3629static page_num_t
3630allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3631	WriteLocker& freeClearQueueLocker)
3632{
3633	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3634	ASSERT(pageState != PAGE_STATE_FREE);
3635	ASSERT(pageState != PAGE_STATE_CLEAR);
3636	ASSERT(start + length <= sNumPages);
3637
3638	// Pull the free/clear pages out of their respective queues. Cached pages
3639	// are allocated later.
3640	page_num_t cachedPages = 0;
3641	VMPageQueue::PageList freePages;
3642	VMPageQueue::PageList clearPages;
3643	page_num_t i = 0;
3644	for (; i < length; i++) {
3645		bool pageAllocated = true;
3646		bool noPage = false;
3647		vm_page& page = sPages[start + i];
3648		switch (page.State()) {
3649			case PAGE_STATE_CLEAR:
3650				DEBUG_PAGE_ACCESS_START(&page);
3651				sClearPageQueue.Remove(&page);
3652				clearPages.Add(&page);
3653				break;
3654			case PAGE_STATE_FREE:
3655				DEBUG_PAGE_ACCESS_START(&page);
3656				sFreePageQueue.Remove(&page);
3657				freePages.Add(&page);
3658				break;
3659			case PAGE_STATE_CACHED:
3660				// We allocate cached pages later.
3661				cachedPages++;
3662				pageAllocated = false;
3663				break;
3664
3665			default:
3666				// Probably a page was cached when our caller checked. Now it's
3667				// gone and we have to abort.
3668				noPage = true;
3669				break;
3670		}
3671
3672		if (noPage)
3673			break;
3674
3675		if (pageAllocated) {
3676			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3677			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3678			page.usage_count = 0;
3679			page.accessed = false;
3680			page.modified = false;
3681		}
3682	}
3683
3684	if (i < length) {
3685		// failed to allocate a page -- free all that we've got
3686		allocate_page_run_cleanup(freePages, clearPages);
3687		return i;
3688	}
3689
3690	freeClearQueueLocker.Unlock();
3691
3692	if (cachedPages > 0) {
3693		// allocate the pages that weren't free but cached
3694		page_num_t freedCachedPages = 0;
3695		page_num_t nextIndex = start;
3696		vm_page* freePage = freePages.Head();
3697		vm_page* clearPage = clearPages.Head();
3698		while (cachedPages > 0) {
3699			// skip, if we've already got the page
3700			if (freePage != NULL && size_t(freePage - sPages) == nextIndex) {
3701				freePage = freePages.GetNext(freePage);
3702				nextIndex++;
3703				continue;
3704			}
3705			if (clearPage != NULL && size_t(clearPage - sPages) == nextIndex) {
3706				clearPage = clearPages.GetNext(clearPage);
3707				nextIndex++;
3708				continue;
3709			}
3710
3711			// free the page, if it is still cached
3712			vm_page& page = sPages[nextIndex];
3713			if (!free_cached_page(&page, false)) {
3714				// TODO: if the page turns out to have been freed already,
3715				// there would be no need to fail
3716				break;
3717			}
3718
3719			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3720			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3721			page.usage_count = 0;
3722			page.accessed = false;
3723			page.modified = false;
3724
3725			freePages.InsertBefore(freePage, &page);
3726			freedCachedPages++;
3727			cachedPages--;
3728			nextIndex++;
3729		}
3730
3731		// If we have freed cached pages, we need to balance things.
3732		if (freedCachedPages > 0)
3733			unreserve_pages(freedCachedPages);
3734
3735		if (nextIndex - start < length) {
3736			// failed to allocate all cached pages -- free all that we've got
3737			freeClearQueueLocker.Lock();
3738			allocate_page_run_cleanup(freePages, clearPages);
3739			freeClearQueueLocker.Unlock();
3740
3741			return nextIndex - start;
3742		}
3743	}
3744
3745	// clear pages, if requested
3746	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3747		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3748				vm_page* page = it.Next();) {
3749			clear_page(page);
3750		}
3751	}
3752
3753	// add pages to target queue
3754	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3755		freePages.MoveFrom(&clearPages);
3756		sPageQueues[pageState].AppendUnlocked(freePages, length);
3757	}
3758
3759	// Note: We don't unreserve the pages since we pulled them out of the
3760	// free/clear queues without adjusting sUnreservedFreePages.
3761
3762#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3763	AbstractTraceEntryWithStackTrace* traceEntry
3764		= TA(AllocatePageRun(start, length));
3765
3766	for (page_num_t i = start; i < start + length; i++)
3767		sPages[i].allocation_tracking_info.Init(traceEntry);
3768#else
3769	TA(AllocatePageRun(start, length));
3770#endif
3771
3772	return length;
3773}
3774
3775
3776/*! Allocate a physically contiguous range of pages.
3777
3778	\param flags Page allocation flags. Encodes the state the function shall
3779		set the allocated pages to, whether the pages shall be marked busy
3780		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3781		(VM_PAGE_ALLOC_CLEAR).
3782	\param length The number of contiguous pages to allocate.
3783	\param restrictions Restrictions to the physical addresses of the page run
3784		to allocate, including \c low_address, the first acceptable physical
3785		address where the page run may start, \c high_address, the last
3786		acceptable physical address where the page run may end (i.e. it must
3787		hold \code runStartAddress + length <= high_address \endcode),
3788		\c alignment, the alignment of the page run start address, and
3789		\c boundary, multiples of which the page run must not cross.
3790		Values set to \c 0 are ignored.
3791	\param priority The page reservation priority (as passed to
3792		vm_page_reserve_pages()).
3793	\return The first page of the allocated page run on success; \c NULL
3794		when the allocation failed.
3795*/
3796vm_page*
3797vm_page_allocate_page_run(uint32 flags, page_num_t length,
3798	const physical_address_restrictions* restrictions, int priority)
3799{
3800	// compute start and end page index
3801	page_num_t requestedStart
3802		= std::max(restrictions->low_address / B_PAGE_SIZE, sPhysicalPageOffset)
3803			- sPhysicalPageOffset;
3804	page_num_t start = requestedStart;
3805	page_num_t end;
3806	if (restrictions->high_address > 0) {
3807		end = std::max(restrictions->high_address / B_PAGE_SIZE,
3808				sPhysicalPageOffset)
3809			- sPhysicalPageOffset;
3810		end = std::min(end, sNumPages);
3811	} else
3812		end = sNumPages;
3813
3814	// compute alignment mask
3815	page_num_t alignmentMask
3816		= std::max(restrictions->alignment / B_PAGE_SIZE, (phys_addr_t)1) - 1;
3817	ASSERT(((alignmentMask + 1) & alignmentMask) == 0);
3818		// alignment must be a power of 2
3819
3820	// compute the boundary mask
3821	uint32 boundaryMask = 0;
3822	if (restrictions->boundary != 0) {
3823		page_num_t boundary = restrictions->boundary / B_PAGE_SIZE;
3824		// boundary must be a power of two and not less than alignment and
3825		// length
3826		ASSERT(((boundary - 1) & boundary) == 0);
3827		ASSERT(boundary >= alignmentMask + 1);
3828		ASSERT(boundary >= length);
3829
3830		boundaryMask = -boundary;
3831	}
3832
3833	vm_page_reservation reservation;
3834	vm_page_reserve_pages(&reservation, length, priority);
3835
3836	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3837
3838	// First we try to get a run with free pages only. If that fails, we also
3839	// consider cached pages. If there are only few free pages and many cached
3840	// ones, the odds are that we won't find enough contiguous ones, so we skip
3841	// the first iteration in this case.
3842	int32 freePages = sUnreservedFreePages;
3843	int useCached = freePages > 0 && (page_num_t)freePages > 2 * length ? 0 : 1;
3844
3845	for (;;) {
3846		if (alignmentMask != 0 || boundaryMask != 0) {
3847			page_num_t offsetStart = start + sPhysicalPageOffset;
3848
3849			// enforce alignment
3850			if ((offsetStart & alignmentMask) != 0)
3851				offsetStart = (offsetStart + alignmentMask) & ~alignmentMask;
3852
3853			// enforce boundary
3854			if (boundaryMask != 0 && ((offsetStart ^ (offsetStart
3855				+ length - 1)) & boundaryMask) != 0) {
3856				offsetStart = (offsetStart + length - 1) & boundaryMask;
3857			}
3858
3859			start = offsetStart - sPhysicalPageOffset;
3860		}
3861
3862		if (start + length > end) {
3863			if (useCached == 0) {
3864				// The first iteration with free pages only was unsuccessful.
3865				// Try again also considering cached pages.
3866				useCached = 1;
3867				start = requestedStart;
3868				continue;
3869			}
3870
3871			dprintf("vm_page_allocate_page_run(): Failed to allocate run of "
3872				"length %" B_PRIuPHYSADDR " (%" B_PRIuPHYSADDR " %"
3873				B_PRIuPHYSADDR ") in second iteration (align: %" B_PRIuPHYSADDR
3874				" boundary: %" B_PRIuPHYSADDR ")!\n", length, requestedStart,
3875				end, restrictions->alignment, restrictions->boundary);
3876
3877			freeClearQueueLocker.Unlock();
3878			vm_page_unreserve_pages(&reservation);
3879			return NULL;
3880		}
3881
3882		bool foundRun = true;
3883		page_num_t i;
3884		for (i = 0; i < length; i++) {
3885			uint32 pageState = sPages[start + i].State();
3886			if (pageState != PAGE_STATE_FREE
3887				&& pageState != PAGE_STATE_CLEAR
3888				&& (pageState != PAGE_STATE_CACHED || useCached == 0)) {
3889				foundRun = false;
3890				break;
3891			}
3892		}
3893
3894		if (foundRun) {
3895			i = allocate_page_run(start, length, flags, freeClearQueueLocker);
3896			if (i == length)
3897				return &sPages[start];
3898
3899			// apparently a cached page couldn't be allocated -- skip it and
3900			// continue
3901			freeClearQueueLocker.Lock();
3902		}
3903
3904		start += i + 1;
3905	}
3906}
3907
3908
3909vm_page *
3910vm_page_at_index(int32 index)
3911{
3912	return &sPages[index];
3913}
3914
3915
/*!	Returns the vm_page corresponding to the given physical page number, or
	\c NULL if the number lies outside the range of managed pages.
*/
vm_page *
vm_lookup_page(page_num_t pageNumber)
3918{
3919	if (pageNumber < sPhysicalPageOffset)
3920		return NULL;
3921
3922	pageNumber -= sPhysicalPageOffset;
3923	if (pageNumber >= sNumPages)
3924		return NULL;
3925
3926	return &sPages[pageNumber];
3927}
3928
3929
/*!	Returns whether the given page is not part of the physical page array,
	i.e. whether it is a dummy/marker page rather than a real physical page.
*/
bool
vm_page_is_dummy(struct vm_page *page)
3932{
3933	return page < sPages || page >= sPages + sNumPages;
3934}
3935
3936
3937/*!	Free the page that belonged to a certain cache.
3938	You can use vm_page_set_state() manually if you prefer, but only
3939	if the page does not equal PAGE_STATE_MODIFIED.
3940
3941	\param cache The cache the page was previously owned by or NULL. The page
3942		must have been removed from its cache before calling this method in
3943		either case.
3944	\param page The page to free.
3945	\param reservation If not NULL, the page count of the reservation will be
3946		incremented, thus allowing to allocate another page for the freed one at
3947		a later time.
3948*/
3949void
3950vm_page_free_etc(VMCache* cache, vm_page* page,
3951	vm_page_reservation* reservation)
3952{
3953	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3954		&& page->State() != PAGE_STATE_CLEAR);
3955
	if (page->State() == PAGE_STATE_MODIFIED && cache != NULL
		&& cache->temporary) {
		atomic_add(&sModifiedTemporaryPages, -1);
	}
3958
3959	free_page(page, false);
3960	if (reservation == NULL)
3961		unreserve_pages(1);
3962}
3963
3964
3965void
3966vm_page_set_state(vm_page *page, int pageState)
3967{
3968	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3969		&& page->State() != PAGE_STATE_CLEAR);
3970
3971	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
3972		free_page(page, pageState == PAGE_STATE_CLEAR);
3973		unreserve_pages(1);
3974	} else
3975		set_page_state(page, pageState);
3976}
3977
3978
3979/*!	Moves a page to either the tail of the head of its current queue,
3980	depending on \a tail.
3981	The page must have a cache and the cache must be locked!
3982*/
3983void
3984vm_page_requeue(struct vm_page *page, bool tail)
3985{
3986	PAGE_ASSERT(page, page->Cache() != NULL);
3987	page->Cache()->AssertLocked();
3988	// DEBUG_PAGE_ACCESS_CHECK(page);
3989		// TODO: This assertion cannot be satisfied by idle_scan_active_pages()
3990		// when it requeues busy pages. The reason is that vm_soft_fault()
3991		// (respectively fault_get_page()) and the file cache keep newly
3992		// allocated pages accessed while they are reading them from disk. It
3993		// would probably be better to change that code and reenable this
3994		// check.
3995
3996	VMPageQueue *queue = NULL;
3997
3998	switch (page->State()) {
3999		case PAGE_STATE_ACTIVE:
4000			queue = &sActivePageQueue;
4001			break;
4002		case PAGE_STATE_INACTIVE:
4003			queue = &sInactivePageQueue;
4004			break;
4005		case PAGE_STATE_MODIFIED:
4006			queue = &sModifiedPageQueue;
4007			break;
4008		case PAGE_STATE_CACHED:
4009			queue = &sCachedPageQueue;
4010			break;
4011		case PAGE_STATE_FREE:
4012		case PAGE_STATE_CLEAR:
4013			panic("vm_page_requeue() called for free/clear page %p", page);
4014			return;
4015		case PAGE_STATE_WIRED:
4016		case PAGE_STATE_UNUSED:
4017			return;
4018		default:
4019			panic("vm_page_touch: vm_page %p in invalid state %d\n",
4020				page, page->State());
4021			break;
4022	}
4023
4024	queue->RequeueUnlocked(page, tail);
4025}
4026
4027
4028page_num_t
4029vm_page_num_pages(void)
4030{
4031	return sNumPages - sNonExistingPages;
4032}
4033
4034
4035/*! There is a subtle distinction between the page counts returned by
4036	this function and vm_page_num_free_pages():
4037	The latter returns the number of pages that are completely uncommitted,
	whereas this one also counts pages that can be made available for use by
	reclaiming them (IOW it also factors in things like cached pages as
	available).
4041*/
4042page_num_t
4043vm_page_num_available_pages(void)
4044{
4045	return vm_available_memory() / B_PAGE_SIZE;
4046}
4047
4048
4049page_num_t
4050vm_page_num_free_pages(void)
4051{
4052	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
4053	return count > 0 ? count : 0;
4054}
4055
4056
4057page_num_t
4058vm_page_num_unused_pages(void)
4059{
4060	int32 count = sUnreservedFreePages;
4061	return count > 0 ? count : 0;
4062}
4063
4064
4065void
4066vm_page_get_stats(system_info *info)
4067{
4068	// Note: there's no locking protecting any of the queues or counters here,
4069	// so we run the risk of getting bogus values when evaluating them
4070	// throughout this function. As these stats are for informational purposes
4071	// only, it is not really worth introducing such locking. Therefore we just
4072	// ensure that we don't under- or overflow any of the values.
4073
4074	// The pages used for the block cache buffers. Those should not be counted
4075	// as used but as cached pages.
4076	// TODO: We should subtract the blocks that are in use ATM, since those
4077	// can't really be freed in a low memory situation.
4078	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
4079	info->block_cache_pages = blockCachePages;
4080
	// Non-temporary modified pages are special: they represent pages that can
	// be written back and thus freed if necessary, which basically makes them
	// cached pages with a higher overhead. The modified queue count is
	// therefore split into temporary and non-temporary counts, and the
	// non-temporary part is added to the cached pages.
4086	page_num_t modifiedNonTemporaryPages
4087		= (sModifiedPageQueue.Count() - sModifiedTemporaryPages);
4088
4089	info->max_pages = vm_page_num_pages();
4090	info->cached_pages = sCachedPageQueue.Count() + modifiedNonTemporaryPages
4091		+ blockCachePages;
4092
4093	// max_pages is composed of:
4094	//	active + inactive + unused + wired + modified + cached + free + clear
4095	// So taking out the cached (including modified non-temporary), free and
4096	// clear ones leaves us with all used pages.
4097	uint32 subtractPages = info->cached_pages + sFreePageQueue.Count()
4098		+ sClearPageQueue.Count();
4099	info->used_pages = subtractPages > info->max_pages
4100		? 0 : info->max_pages - subtractPages;
4101
4102	if (info->used_pages + info->cached_pages > info->max_pages) {
4103		// Something was shuffled around while we were summing up the counts.
4104		// Make the values sane, preferring the worse case of more used pages.
4105		info->cached_pages = info->max_pages - info->used_pages;
4106	}
4107
4108	info->page_faults = vm_num_page_faults();
4109	info->ignored_pages = sIgnoredPages;
4110
4111	// TODO: We don't consider pages used for page directories/tables yet.
4112}
4113
4114
4115/*!	Returns the greatest address within the last page of accessible physical
4116	memory.
4117	The value is inclusive, i.e. in case of a 32 bit phys_addr_t 0xffffffff
	means that the last page ends at exactly 4 GB.
4119*/
4120phys_addr_t
4121vm_page_max_address()
4122{
4123	return ((phys_addr_t)sPhysicalPageOffset + sNumPages) * B_PAGE_SIZE - 1;
4124}
4125
4126
4127RANGE_MARKER_FUNCTION_END(vm_page)
4128