1/*
2 * Copyright 2004-2012, Axel D��rfler, axeld@pinc-software.de.
3 * Distributed under the terms of the MIT License.
4 */
5
6
7#include <block_cache.h>
8
9#include <unistd.h>
10#include <stdlib.h>
11#include <string.h>
12#include <errno.h>
13
14#include <KernelExport.h>
15#include <fs_cache.h>
16
17#include <condition_variable.h>
18#include <lock.h>
19#include <low_resource_manager.h>
20#include <slab/Slab.h>
21#include <tracing.h>
22#include <util/kernel_cpp.h>
23#include <util/DoublyLinkedList.h>
24#include <util/AutoLock.h>
25#include <vm/vm_page.h>
26
27#include "kernel_debug_config.h"
28
29
30// TODO: this is a naive but growing implementation to test the API:
31//	block reading/writing is not at all optimized for speed, it will
32//	just read and write single blocks.
33// TODO: the retrieval/copy of the original data could be delayed until the
34//		new data must be written, ie. in low memory situations.
35
36//#define TRACE_BLOCK_CACHE
37#ifdef TRACE_BLOCK_CACHE
38#	define TRACE(x)	dprintf x
39#else
40#	define TRACE(x) ;
41#endif
42
43#define TRACE_ALWAYS(x) dprintf x
44
45// This macro is used for fatal situations that are acceptable in a running
46// system, like out of memory situations - should only panic for debugging.
47#define FATAL(x) panic x
48
49static const bigtime_t kTransactionIdleTime = 2000000LL;
50	// a transaction is considered idle after 2 seconds of inactivity
51
52
53namespace {
54
55struct cache_transaction;
56struct cached_block;
57struct block_cache;
58typedef DoublyLinkedListLink<cached_block> block_link;
59
60struct cached_block {
61	cached_block*	next;			// next in hash
62	cached_block*	transaction_next;
63	block_link		link;
64	off_t			block_number;
65	void*			current_data;
66		// The data that is seen by everyone using the API; this one is always
67		// present.
68	void*			original_data;
69		// When in a transaction, this contains the original data from before
70		// the transaction.
71	void*			parent_data;
72		// This is a lazily alloced buffer that represents the contents of the
73		// block in the parent transaction. It may point to current_data if the
74		// contents have been changed only in the parent transaction, or, if the
75		// block has been changed in the current sub transaction already, to a
76		// new block containing the contents changed in the parent transaction.
77		// If this is NULL, the block has not been changed in the parent
78		// transaction at all.
79#if BLOCK_CACHE_DEBUG_CHANGED
80	void*			compare;
81#endif
82	int32			ref_count;
83	int32			last_accessed;
84	bool			busy_reading : 1;
85	bool			busy_writing : 1;
86	bool			is_writing : 1;
87		// Block has been checked out for writing without transactions, and
88		// cannot be written back if set
89	bool			is_dirty : 1;
90	bool			unused : 1;
91	bool			discard : 1;
92	bool			busy_reading_waiters : 1;
93	bool			busy_writing_waiters : 1;
94	cache_transaction* transaction;
95		// This is the current active transaction, if any, the block is
96		// currently in (meaning was changed as a part of it).
97	cache_transaction* previous_transaction;
98		// This is set to the last transaction that was ended containing this
99		// block. In this case, the block has not yet written back yet, and
100		// the changed data is either in current_data, or original_data -- the
101		// latter if the block is already being part of another transaction.
102		// There can only be one previous transaction, so when the active
103		// transaction ends, the changes of the previous transaction have to
104		// be written back before that transaction becomes the next previous
105		// transaction.
106
107	bool CanBeWritten() const;
108	int32 LastAccess() const
109		{ return system_time() / 1000000L - last_accessed; }
110};
111
112typedef DoublyLinkedList<cached_block,
113	DoublyLinkedListMemberGetLink<cached_block,
114		&cached_block::link> > block_list;
115
116struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
117	static inline void* operator new(size_t size);
118	static inline void operator delete(void* block);
119
120	int32			transaction_id;
121	int32			events_pending;
122	int32			events;
123	transaction_notification_hook hook;
124	void*			data;
125	bool			delete_after_event;
126};
127
128typedef DoublyLinkedList<cache_notification> NotificationList;
129
130static object_cache* sCacheNotificationCache;
131
132struct cache_listener;
133typedef DoublyLinkedListLink<cache_listener> listener_link;
134
135struct cache_listener : cache_notification {
136	listener_link	link;
137};
138
139typedef DoublyLinkedList<cache_listener,
140	DoublyLinkedListMemberGetLink<cache_listener,
141		&cache_listener::link> > ListenerList;
142
143void*
144cache_notification::operator new(size_t size)
145{
146	// We can't really know whether something is a cache_notification or a
147	// cache_listener at runtime, so we just use one object_cache for both
148	// with the size set to that of the (slightly larger) cache_listener.
149	// In practice, the vast majority of cache_notifications are really
150	// cache_listeners, so this is a more than acceptable trade-off.
151	ASSERT(size <= sizeof(cache_listener));
152	return object_cache_alloc(sCacheNotificationCache, 0);
153}
154
155void
156cache_notification::operator delete(void* block)
157{
158	object_cache_free(sCacheNotificationCache, block, 0);
159}
160
161
162struct BlockHash {
163	typedef off_t			KeyType;
164	typedef	cached_block	ValueType;
165
166	size_t HashKey(KeyType key) const
167	{
168		return key;
169	}
170
171	size_t Hash(ValueType* block) const
172	{
173		return block->block_number;
174	}
175
176	bool Compare(KeyType key, ValueType* block) const
177	{
178		return block->block_number == key;
179	}
180
181	ValueType*& GetLink(ValueType* value) const
182	{
183		return value->next;
184	}
185};
186
187typedef BOpenHashTable<BlockHash> BlockTable;
188
189
190struct TransactionHash {
191	typedef int32				KeyType;
192	typedef	cache_transaction	ValueType;
193
194	size_t HashKey(KeyType key) const
195	{
196		return key;
197	}
198
199	size_t Hash(ValueType* transaction) const;
200	bool Compare(KeyType key, ValueType* transaction) const;
201	ValueType*& GetLink(ValueType* value) const;
202};
203
204typedef BOpenHashTable<TransactionHash> TransactionTable;
205
206
207struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
208	BlockTable*		hash;
209	mutex			lock;
210	int				fd;
211	off_t			max_blocks;
212	size_t			block_size;
213	int32			next_transaction_id;
214	cache_transaction* last_transaction;
215	TransactionTable* transaction_hash;
216
217	object_cache*	buffer_cache;
218	block_list		unused_blocks;
219	uint32			unused_block_count;
220
221	ConditionVariable busy_reading_condition;
222	uint32			busy_reading_count;
223	bool			busy_reading_waiters;
224
225	ConditionVariable busy_writing_condition;
226	uint32			busy_writing_count;
227	bool			busy_writing_waiters;
228
229	bigtime_t		last_block_write;
230	bigtime_t		last_block_write_duration;
231
232	uint32			num_dirty_blocks;
233	bool			read_only;
234
235	NotificationList pending_notifications;
236	ConditionVariable condition_variable;
237
238					block_cache(int fd, off_t numBlocks, size_t blockSize,
239						bool readOnly);
240					~block_cache();
241
242	status_t		Init();
243
244	void			Free(void* buffer);
245	void*			Allocate();
246	void			FreeBlock(cached_block* block);
247	cached_block*	NewBlock(off_t blockNumber);
248	void			FreeBlockParentData(cached_block* block);
249
250	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
251	void			RemoveBlock(cached_block* block);
252	void			DiscardBlock(cached_block* block);
253
254private:
255	static void		_LowMemoryHandler(void* data, uint32 resources,
256						int32 level);
257	cached_block*	_GetUnusedBlock();
258};
259
260struct cache_transaction {
261	cache_transaction();
262
263	cache_transaction* next;
264	int32			id;
265	int32			num_blocks;
266	int32			main_num_blocks;
267	int32			sub_num_blocks;
268	cached_block*	first_block;
269	block_list		blocks;
270	ListenerList	listeners;
271	bool			open;
272	bool			has_sub_transaction;
273	bigtime_t		last_used;
274	int32			busy_writing_count;
275};
276
277
278class BlockWriter {
279public:
280								BlockWriter(block_cache* cache,
281									size_t max = SIZE_MAX);
282								~BlockWriter();
283
284			bool				Add(cached_block* block,
285									cache_transaction* transaction = NULL);
286			bool				Add(cache_transaction* transaction,
287									bool& hasLeftOvers);
288
289			status_t			Write(cache_transaction* transaction = NULL,
290									bool canUnlock = true);
291
292			bool				DeletedTransaction() const
293									{ return fDeletedTransaction; }
294
295	static	status_t			WriteBlock(block_cache* cache,
296									cached_block* block);
297
298private:
299			void*				_Data(cached_block* block) const;
300			status_t			_WriteBlock(cached_block* block);
301			void				_BlockDone(cached_block* block,
302									cache_transaction* transaction);
303			void				_UnmarkWriting(cached_block* block);
304
305	static	int					_CompareBlocks(const void* _blockA,
306									const void* _blockB);
307
308private:
309	static	const size_t		kBufferSize = 64;
310
311			block_cache*		fCache;
312			cached_block*		fBuffer[kBufferSize];
313			cached_block**		fBlocks;
314			size_t				fCount;
315			size_t				fTotal;
316			size_t				fCapacity;
317			size_t				fMax;
318			status_t			fStatus;
319			bool				fDeletedTransaction;
320};
321
322
323class TransactionLocking {
324public:
325	inline bool Lock(block_cache* cache)
326	{
327		mutex_lock(&cache->lock);
328
329		while (cache->busy_writing_count != 0) {
330			// wait for all blocks to be written
331			ConditionVariableEntry entry;
332			cache->busy_writing_condition.Add(&entry);
333			cache->busy_writing_waiters = true;
334
335			mutex_unlock(&cache->lock);
336
337			entry.Wait();
338
339			mutex_lock(&cache->lock);
340		}
341
342		return true;
343	}
344
345	inline void Unlock(block_cache* cache)
346	{
347		mutex_unlock(&cache->lock);
348	}
349};
350
351typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
352
353} // namespace
354
355
356#if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
357namespace BlockTracing {
358
359class Action : public AbstractTraceEntry {
360public:
361	Action(block_cache* cache, cached_block* block)
362		:
363		fCache(cache),
364		fBlockNumber(block->block_number),
365		fIsDirty(block->is_dirty),
366		fHasOriginal(block->original_data != NULL),
367		fHasParent(block->parent_data != NULL),
368		fTransactionID(-1),
369		fPreviousID(-1)
370	{
371		if (block->transaction != NULL)
372			fTransactionID = block->transaction->id;
373		if (block->previous_transaction != NULL)
374			fPreviousID = block->previous_transaction->id;
375	}
376
377	virtual void AddDump(TraceOutput& out)
378	{
379		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
380			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
381			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
382			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
383	}
384
385	virtual const char* _Action() const = 0;
386
387private:
388	block_cache*		fCache;
389	uint64				fBlockNumber;
390	bool				fIsDirty;
391	bool				fHasOriginal;
392	bool				fHasParent;
393	int32				fTransactionID;
394	int32				fPreviousID;
395};
396
397class Get : public Action {
398public:
399	Get(block_cache* cache, cached_block* block)
400		:
401		Action(cache, block)
402	{
403		Initialized();
404	}
405
406	virtual const char* _Action() const { return "get"; }
407};
408
409class Put : public Action {
410public:
411	Put(block_cache* cache, cached_block* block)
412		:
413		Action(cache, block)
414	{
415		Initialized();
416	}
417
418	virtual const char* _Action() const { return "put"; }
419};
420
421class Read : public Action {
422public:
423	Read(block_cache* cache, cached_block* block)
424		:
425		Action(cache, block)
426	{
427		Initialized();
428	}
429
430	virtual const char* _Action() const { return "read"; }
431};
432
433class Write : public Action {
434public:
435	Write(block_cache* cache, cached_block* block)
436		:
437		Action(cache, block)
438	{
439		Initialized();
440	}
441
442	virtual const char* _Action() const { return "write"; }
443};
444
445class Flush : public Action {
446public:
447	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
448		:
449		Action(cache, block),
450		fGetUnused(getUnused)
451	{
452		Initialized();
453	}
454
455	virtual const char* _Action() const
456		{ return fGetUnused ? "get-unused" : "flush"; }
457
458private:
459	bool	fGetUnused;
460};
461
462class Error : public AbstractTraceEntry {
463public:
464	Error(block_cache* cache, uint64 blockNumber, const char* message,
465			status_t status = B_OK)
466		:
467		fCache(cache),
468		fBlockNumber(blockNumber),
469		fMessage(message),
470		fStatus(status)
471	{
472		Initialized();
473	}
474
475	virtual void AddDump(TraceOutput& out)
476	{
477		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
478			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
479			fStatus != B_OK ? strerror(fStatus) : "");
480	}
481
482private:
483	block_cache*	fCache;
484	uint64			fBlockNumber;
485	const char*		fMessage;
486	status_t		fStatus;
487};
488
489#if BLOCK_CACHE_BLOCK_TRACING >= 2
490class BlockData : public AbstractTraceEntry {
491public:
492	enum {
493		kCurrent	= 0x01,
494		kParent		= 0x02,
495		kOriginal	= 0x04
496	};
497
498	BlockData(block_cache* cache, cached_block* block, const char* message)
499		:
500		fCache(cache),
501		fSize(cache->block_size),
502		fBlockNumber(block->block_number),
503		fMessage(message)
504	{
505		_Allocate(fCurrent, block->current_data);
506		_Allocate(fParent, block->parent_data);
507		_Allocate(fOriginal, block->original_data);
508
509#if KTRACE_PRINTF_STACK_TRACE
510		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
511			false);
512#endif
513
514		Initialized();
515	}
516
517	virtual void AddDump(TraceOutput& out)
518	{
519		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
520			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
521			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
522			fMessage);
523	}
524
525#if KTRACE_PRINTF_STACK_TRACE
526	virtual void DumpStackTrace(TraceOutput& out)
527	{
528		out.PrintStackTrace(fStackTrace);
529	}
530#endif
531
532	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
533	{
534		if ((which & kCurrent) != 0)
535			DumpBlock(kCurrent, offset, size);
536		if ((which & kParent) != 0)
537			DumpBlock(kParent, offset, size);
538		if ((which & kOriginal) != 0)
539			DumpBlock(kOriginal, offset, size);
540	}
541
542	void DumpBlock(uint32 which, uint32 offset, uint32 size)
543	{
544		if (offset > fSize) {
545			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
546			return;
547		}
548		if (offset + size > fSize)
549			size = fSize - offset;
550
551		const char* label;
552		uint8* data;
553
554		if ((which & kCurrent) != 0) {
555			label = "current";
556			data = fCurrent;
557		} else if ((which & kParent) != 0) {
558			label = "parent";
559			data = fParent;
560		} else if ((which & kOriginal) != 0) {
561			label = "original";
562			data = fOriginal;
563		} else
564			return;
565
566		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
567
568		static const uint32 kBlockSize = 16;
569		data += offset;
570
571		for (uint32 i = 0; i < size;) {
572			int start = i;
573
574			kprintf("  %04" B_PRIx32 " ", i);
575			for (; i < start + kBlockSize; i++) {
576				if (!(i % 4))
577					kprintf(" ");
578
579				if (i >= size)
580					kprintf("  ");
581				else
582					kprintf("%02x", data[i]);
583			}
584
585			kprintf("\n");
586		}
587	}
588
589private:
590	void _Allocate(uint8*& target, void* source)
591	{
592		if (source == NULL) {
593			target = NULL;
594			return;
595		}
596
597		target = alloc_tracing_buffer_memcpy(source, fSize, false);
598	}
599
600	block_cache*	fCache;
601	uint32			fSize;
602	uint64			fBlockNumber;
603	const char*		fMessage;
604	uint8*			fCurrent;
605	uint8*			fParent;
606	uint8*			fOriginal;
607#if KTRACE_PRINTF_STACK_TRACE
608	tracing_stack_trace* fStackTrace;
609#endif
610};
611#endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
612
613}	// namespace BlockTracing
614
615#	define TB(x) new(std::nothrow) BlockTracing::x;
616#else
617#	define TB(x) ;
618#endif
619
620#if BLOCK_CACHE_BLOCK_TRACING >= 2
621#	define TB2(x) new(std::nothrow) BlockTracing::x;
622#else
623#	define TB2(x) ;
624#endif
625
626
627#if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
628namespace TransactionTracing {
629
630class Action : public AbstractTraceEntry {
631public:
632	Action(const char* label, block_cache* cache,
633			cache_transaction* transaction)
634		:
635		fCache(cache),
636		fTransaction(transaction),
637		fID(transaction->id),
638		fSub(transaction->has_sub_transaction),
639		fNumBlocks(transaction->num_blocks),
640		fSubNumBlocks(transaction->sub_num_blocks)
641	{
642		strlcpy(fLabel, label, sizeof(fLabel));
643		Initialized();
644	}
645
646	virtual void AddDump(TraceOutput& out)
647	{
648		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
649			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
650			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
651	}
652
653private:
654	char				fLabel[12];
655	block_cache*		fCache;
656	cache_transaction*	fTransaction;
657	int32				fID;
658	bool				fSub;
659	int32				fNumBlocks;
660	int32				fSubNumBlocks;
661};
662
663class Detach : public AbstractTraceEntry {
664public:
665	Detach(block_cache* cache, cache_transaction* transaction,
666			cache_transaction* newTransaction)
667		:
668		fCache(cache),
669		fTransaction(transaction),
670		fID(transaction->id),
671		fSub(transaction->has_sub_transaction),
672		fNewTransaction(newTransaction),
673		fNewID(newTransaction->id)
674	{
675		Initialized();
676	}
677
678	virtual void AddDump(TraceOutput& out)
679	{
680		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
681			"from transaction %p (id %" B_PRId32 ")%s",
682			fCache, fNewTransaction, fNewID, fTransaction, fID,
683			fSub ? " sub" : "");
684	}
685
686private:
687	block_cache*		fCache;
688	cache_transaction*	fTransaction;
689	int32				fID;
690	bool				fSub;
691	cache_transaction*	fNewTransaction;
692	int32				fNewID;
693};
694
695class Abort : public AbstractTraceEntry {
696public:
697	Abort(block_cache* cache, cache_transaction* transaction)
698		:
699		fCache(cache),
700		fTransaction(transaction),
701		fID(transaction->id),
702		fNumBlocks(0)
703	{
704		bool isSub = transaction->has_sub_transaction;
705		fNumBlocks = isSub ? transaction->sub_num_blocks
706			: transaction->num_blocks;
707		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
708		if (fBlocks != NULL) {
709			cached_block* block = transaction->first_block;
710			for (int32 i = 0; block != NULL && i < fNumBlocks;
711					block = block->transaction_next) {
712				fBlocks[i++] = block->block_number;
713			}
714		} else
715			fNumBlocks = 0;
716
717#if KTRACE_PRINTF_STACK_TRACE
718		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
719			false);
720#endif
721
722		Initialized();
723	}
724
725	virtual void AddDump(TraceOutput& out)
726	{
727		out.Print("block cache %p, abort transaction "
728			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
729		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
730			out.Print(" %" B_PRIdOFF, fBlocks[i]);
731	}
732
733#if KTRACE_PRINTF_STACK_TRACE
734	virtual void DumpStackTrace(TraceOutput& out)
735	{
736		out.PrintStackTrace(fStackTrace);
737	}
738#endif
739
740private:
741	block_cache*		fCache;
742	cache_transaction*	fTransaction;
743	int32				fID;
744	off_t*				fBlocks;
745	int32				fNumBlocks;
746#if KTRACE_PRINTF_STACK_TRACE
747	tracing_stack_trace* fStackTrace;
748#endif
749};
750
751}	// namespace TransactionTracing
752
753#	define T(x) new(std::nothrow) TransactionTracing::x;
754#else
755#	define T(x) ;
756#endif
757
758
759static DoublyLinkedList<block_cache> sCaches;
760static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
761static mutex sCachesMemoryUseLock
762	= MUTEX_INITIALIZER("block caches memory use");
763static size_t sUsedMemory;
764static sem_id sEventSemaphore;
765static mutex sNotificationsLock
766	= MUTEX_INITIALIZER("block cache notifications");
767static thread_id sNotifierWriterThread;
768static DoublyLinkedListLink<block_cache> sMarkCache;
769	// TODO: this only works if the link is the first entry of block_cache
770static object_cache* sBlockCache;
771
772
773//	#pragma mark - notifications/listener
774
775
776/*!	Checks whether or not this is an event that closes a transaction. */
777static inline bool
778is_closing_event(int32 event)
779{
780	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
781}
782
783
784static inline bool
785is_written_event(int32 event)
786{
787	return (event & TRANSACTION_WRITTEN) != 0;
788}
789
790
791/*!	From the specified \a notification, it will remove the lowest pending
792	event, and return that one in \a _event.
793	If there is no pending event anymore, it will return \c false.
794*/
795static bool
796get_next_pending_event(cache_notification* notification, int32* _event)
797{
798	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
799		int32 pending = atomic_and(&notification->events_pending,
800			~eventMask);
801
802		bool more = (pending & ~eventMask) != 0;
803
804		if ((pending & eventMask) != 0) {
805			*_event = eventMask;
806			return more;
807		}
808	}
809
810	return false;
811}
812
813
814static void
815flush_pending_notifications(block_cache* cache)
816{
817	ASSERT_LOCKED_MUTEX(&sCachesLock);
818
819	while (true) {
820		MutexLocker locker(sNotificationsLock);
821
822		cache_notification* notification = cache->pending_notifications.Head();
823		if (notification == NULL)
824			return;
825
826		bool deleteAfterEvent = false;
827		int32 event = -1;
828		if (!get_next_pending_event(notification, &event)) {
829			// remove the notification if this was the last pending event
830			cache->pending_notifications.Remove(notification);
831			deleteAfterEvent = notification->delete_after_event;
832		}
833
834		if (event >= 0) {
835			// Notify listener, we need to copy the notification, as it might
836			// be removed when we unlock the list.
837			cache_notification copy = *notification;
838			locker.Unlock();
839
840			copy.hook(copy.transaction_id, event, copy.data);
841
842			locker.Lock();
843		}
844
845		if (deleteAfterEvent)
846			delete notification;
847	}
848}
849
850
851/*!	Flushes all pending notifications by calling the appropriate hook
852	functions.
853	Must not be called with a cache lock held.
854*/
855static void
856flush_pending_notifications()
857{
858	MutexLocker _(sCachesLock);
859
860	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
861	while (iterator.HasNext()) {
862		block_cache* cache = iterator.Next();
863
864		flush_pending_notifications(cache);
865	}
866}
867
868
869/*!	Initializes the \a notification as specified. */
870static void
871set_notification(cache_transaction* transaction,
872	cache_notification &notification, int32 events,
873	transaction_notification_hook hook, void* data)
874{
875	notification.transaction_id = transaction != NULL ? transaction->id : -1;
876	notification.events_pending = 0;
877	notification.events = events;
878	notification.hook = hook;
879	notification.data = data;
880	notification.delete_after_event = false;
881}
882
883
884/*!	Makes sure the notification is deleted. It either deletes it directly,
885	when possible, or marks it for deletion if the notification is pending.
886*/
887static void
888delete_notification(cache_notification* notification)
889{
890	MutexLocker locker(sNotificationsLock);
891
892	if (notification->events_pending != 0)
893		notification->delete_after_event = true;
894	else
895		delete notification;
896}
897
898
899/*!	Adds the notification to the pending notifications list, or, if it's
900	already part of it, updates its events_pending field.
901	Also marks the notification to be deleted if \a deleteNotification
902	is \c true.
903	Triggers the notifier thread to run.
904*/
905static void
906add_notification(block_cache* cache, cache_notification* notification,
907	int32 event, bool deleteNotification)
908{
909	if (notification->hook == NULL)
910		return;
911
912	int32 pending = atomic_or(&notification->events_pending, event);
913	if (pending == 0) {
914		// not yet part of the notification list
915		MutexLocker locker(sNotificationsLock);
916		if (deleteNotification)
917			notification->delete_after_event = true;
918		cache->pending_notifications.Add(notification);
919	} else if (deleteNotification) {
920		// we might need to delete it ourselves if we're late
921		delete_notification(notification);
922	}
923
924	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
925		// We're probably still holding some locks that makes rescheduling
926		// not a good idea at this point.
927}
928
929
930/*!	Notifies all interested listeners of this transaction about the \a event.
931	If \a event is a closing event (ie. TRANSACTION_ENDED, and
932	TRANSACTION_ABORTED), all listeners except those listening to
933	TRANSACTION_WRITTEN will be removed.
934*/
935static void
936notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
937	int32 event)
938{
939	T(Action("notify", cache, transaction));
940
941	bool isClosing = is_closing_event(event);
942	bool isWritten = is_written_event(event);
943
944	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
945	while (iterator.HasNext()) {
946		cache_listener* listener = iterator.Next();
947
948		bool remove = (isClosing && !is_written_event(listener->events))
949			|| (isWritten && is_written_event(listener->events));
950		if (remove)
951			iterator.Remove();
952
953		if ((listener->events & event) != 0)
954			add_notification(cache, listener, event, remove);
955		else if (remove)
956			delete_notification(listener);
957	}
958}
959
960
961/*!	Removes and deletes all listeners that are still monitoring this
962	transaction.
963*/
964static void
965remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
966{
967	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
968	while (iterator.HasNext()) {
969		cache_listener* listener = iterator.Next();
970		iterator.Remove();
971
972		delete_notification(listener);
973	}
974}
975
976
977static status_t
978add_transaction_listener(block_cache* cache, cache_transaction* transaction,
979	int32 events, transaction_notification_hook hookFunction, void* data)
980{
981	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
982	while (iterator.HasNext()) {
983		cache_listener* listener = iterator.Next();
984
985		if (listener->data == data && listener->hook == hookFunction) {
986			// this listener already exists, just update it
987			listener->events |= events;
988			return B_OK;
989		}
990	}
991
992	cache_listener* listener = new cache_listener;
993	if (listener == NULL)
994		return B_NO_MEMORY;
995
996	set_notification(transaction, *listener, events, hookFunction, data);
997	transaction->listeners.Add(listener);
998	return B_OK;
999}
1000
1001
1002//	#pragma mark - private transaction
1003
1004
1005cache_transaction::cache_transaction()
1006{
1007	num_blocks = 0;
1008	main_num_blocks = 0;
1009	sub_num_blocks = 0;
1010	first_block = NULL;
1011	open = true;
1012	has_sub_transaction = false;
1013	last_used = system_time();
1014	busy_writing_count = 0;
1015}
1016
1017
1018static void
1019delete_transaction(block_cache* cache, cache_transaction* transaction)
1020{
1021	if (cache->last_transaction == transaction)
1022		cache->last_transaction = NULL;
1023
1024	remove_transaction_listeners(cache, transaction);
1025	delete transaction;
1026}
1027
1028
1029static cache_transaction*
1030lookup_transaction(block_cache* cache, int32 id)
1031{
1032	return cache->transaction_hash->Lookup(id);
1033}
1034
1035
1036size_t TransactionHash::Hash(cache_transaction* transaction) const
1037{
1038	return transaction->id;
1039}
1040
1041
1042bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1043{
1044	return transaction->id == key;
1045}
1046
1047
1048cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1049{
1050	return value->next;
1051}
1052
1053
1054/*!	Writes back any changes made to blocks in \a transaction that are still
1055	part of a previous transacton.
1056*/
1057static status_t
1058write_blocks_in_previous_transaction(block_cache* cache,
1059	cache_transaction* transaction)
1060{
1061	BlockWriter writer(cache);
1062
1063	cached_block* block = transaction->first_block;
1064	for (; block != NULL; block = block->transaction_next) {
1065		if (block->previous_transaction != NULL) {
1066			// need to write back pending changes
1067			writer.Add(block);
1068		}
1069	}
1070
1071	return writer.Write();
1072}
1073
1074
1075//	#pragma mark - cached_block
1076
1077
1078bool
1079cached_block::CanBeWritten() const
1080{
1081	return !busy_writing && !busy_reading
1082		&& (previous_transaction != NULL
1083			|| (transaction == NULL && is_dirty && !is_writing));
1084}
1085
1086
1087//	#pragma mark - BlockWriter
1088
1089
1090BlockWriter::BlockWriter(block_cache* cache, size_t max)
1091	:
1092	fCache(cache),
1093	fBlocks(fBuffer),
1094	fCount(0),
1095	fTotal(0),
1096	fCapacity(kBufferSize),
1097	fMax(max),
1098	fStatus(B_OK),
1099	fDeletedTransaction(false)
1100{
1101}
1102
1103
1104BlockWriter::~BlockWriter()
1105{
1106	if (fBlocks != fBuffer)
1107		free(fBlocks);
1108}
1109
1110
1111/*!	Adds the specified block to the to be written array. If no more blocks can
1112	be added, false is returned, otherwise true.
1113*/
1114bool
1115BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1116{
1117	ASSERT(block->CanBeWritten());
1118
1119	if (fTotal == fMax)
1120		return false;
1121
1122	if (fCount >= fCapacity) {
1123		// Enlarge array if necessary
1124		cached_block** newBlocks;
1125		size_t newCapacity = max_c(256, fCapacity * 2);
1126		if (fBlocks == fBuffer)
1127			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1128		else {
1129			newBlocks = (cached_block**)realloc(fBlocks,
1130				newCapacity * sizeof(void*));
1131		}
1132
1133		if (newBlocks == NULL) {
1134			// Allocating a larger array failed - we need to write back what
1135			// we have synchronously now (this will also clear the array)
1136			Write(transaction, false);
1137		} else {
1138			if (fBlocks == fBuffer)
1139				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1140
1141			fBlocks = newBlocks;
1142			fCapacity = newCapacity;
1143		}
1144	}
1145
1146	fBlocks[fCount++] = block;
1147	fTotal++;
1148	block->busy_writing = true;
1149	fCache->busy_writing_count++;
1150	if (block->previous_transaction != NULL)
1151		block->previous_transaction->busy_writing_count++;
1152
1153	return true;
1154}
1155
1156
1157/*!	Adds all blocks of the specified transaction to the to be written array.
1158	If no more blocks can be added, false is returned, otherwise true.
1159*/
1160bool
1161BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1162{
1163	ASSERT(!transaction->open);
1164
1165	if (transaction->busy_writing_count != 0) {
1166		hasLeftOvers = true;
1167		return true;
1168	}
1169
1170	hasLeftOvers = false;
1171
1172	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1173	while (cached_block* block = blockIterator.Next()) {
1174		if (!block->CanBeWritten()) {
1175			// This block was already part of a previous transaction within this
1176			// writer
1177			hasLeftOvers = true;
1178			continue;
1179		}
1180		if (!Add(block, transaction))
1181			return false;
1182
1183		if (DeletedTransaction())
1184			break;
1185	}
1186
1187	return true;
1188}
1189
1190
1191/*! Cache must be locked when calling this method, but it will be unlocked
1192	while the blocks are written back.
1193*/
1194status_t
1195BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1196{
1197	if (fCount == 0)
1198		return B_OK;
1199
1200	if (canUnlock)
1201		mutex_unlock(&fCache->lock);
1202
1203	// Sort blocks in their on-disk order
1204	// TODO: ideally, this should be handled by the I/O scheduler
1205
1206	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1207	fDeletedTransaction = false;
1208
1209	bigtime_t start = system_time();
1210
1211	for (uint32 i = 0; i < fCount; i++) {
1212		status_t status = _WriteBlock(fBlocks[i]);
1213		if (status != B_OK) {
1214			// propagate to global error handling
1215			if (fStatus == B_OK)
1216				fStatus = status;
1217
1218			_UnmarkWriting(fBlocks[i]);
1219			fBlocks[i] = NULL;
1220				// This block will not be marked clean
1221		}
1222	}
1223
1224	bigtime_t finish = system_time();
1225
1226	if (canUnlock)
1227		mutex_lock(&fCache->lock);
1228
1229	if (fStatus == B_OK && fCount >= 8) {
1230		fCache->last_block_write = finish;
1231		fCache->last_block_write_duration = (fCache->last_block_write - start)
1232			/ fCount;
1233	}
1234
1235	for (uint32 i = 0; i < fCount; i++)
1236		_BlockDone(fBlocks[i], transaction);
1237
1238	fCount = 0;
1239	return fStatus;
1240}
1241
1242
1243/*!	Writes the specified \a block back to disk. It will always only write back
1244	the oldest change of the block if it is part of more than one transaction.
1245	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1246	delete transactions when they are no longer used, and \a deleteTransaction
1247	is \c true.
1248*/
1249/*static*/ status_t
1250BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1251{
1252	BlockWriter writer(cache);
1253
1254	writer.Add(block);
1255	return writer.Write();
1256}
1257
1258
1259void*
1260BlockWriter::_Data(cached_block* block) const
1261{
1262	return block->previous_transaction != NULL && block->original_data != NULL
1263		? block->original_data : block->current_data;
1264		// We first need to write back changes from previous transactions
1265}
1266
1267
1268status_t
1269BlockWriter::_WriteBlock(cached_block* block)
1270{
1271	ASSERT(block->busy_writing);
1272
1273	TRACE(("BlockWriter::_WriteBlock(block %" B_PRIdOFF ")\n", block->block_number));
1274	TB(Write(fCache, block));
1275	TB2(BlockData(fCache, block, "before write"));
1276
1277	size_t blockSize = fCache->block_size;
1278
1279	ssize_t written = write_pos(fCache->fd,
1280		block->block_number * blockSize, _Data(block), blockSize);
1281
1282	if (written != (ssize_t)blockSize) {
1283		TB(Error(fCache, block->block_number, "write failed", written));
1284		TRACE_ALWAYS(("could not write back block %" B_PRIdOFF " (%s)\n", block->block_number,
1285			strerror(errno)));
1286		if (written < 0)
1287			return errno;
1288
1289		return B_IO_ERROR;
1290	}
1291
1292	return B_OK;
1293}
1294
1295
1296void
1297BlockWriter::_BlockDone(cached_block* block,
1298	cache_transaction* transaction)
1299{
1300	if (block == NULL) {
1301		// An error occured when trying to write this block
1302		return;
1303	}
1304
1305	if (fCache->num_dirty_blocks > 0)
1306		fCache->num_dirty_blocks--;
1307
1308	if (_Data(block) == block->current_data)
1309		block->is_dirty = false;
1310
1311	_UnmarkWriting(block);
1312
1313	cache_transaction* previous = block->previous_transaction;
1314	if (previous != NULL) {
1315		previous->blocks.Remove(block);
1316		block->previous_transaction = NULL;
1317
1318		if (block->original_data != NULL && block->transaction == NULL) {
1319			// This block is not part of a transaction, so it does not need
1320			// its original pointer anymore.
1321			fCache->Free(block->original_data);
1322			block->original_data = NULL;
1323		}
1324
1325		// Has the previous transaction been finished with that write?
1326		if (--previous->num_blocks == 0) {
1327			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1328			T(Action("written", fCache, previous));
1329
1330			notify_transaction_listeners(fCache, previous,
1331				TRANSACTION_WRITTEN);
1332
1333			if (transaction != NULL) {
1334				// This function is called while iterating transaction_hash. We
1335				// use RemoveUnchecked so the iterator is still valid. A regular
1336				// Remove can trigger a resize of the hash table which would
1337				// result in the linked items in the table changing order.
1338				fCache->transaction_hash->RemoveUnchecked(transaction);
1339			} else
1340				fCache->transaction_hash->Remove(previous);
1341
1342			delete_transaction(fCache, previous);
1343			fDeletedTransaction = true;
1344		}
1345	}
1346	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1347		// the block is no longer used
1348		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1349		block->unused = true;
1350		fCache->unused_blocks.Add(block);
1351		fCache->unused_block_count++;
1352	}
1353
1354	TB2(BlockData(fCache, block, "after write"));
1355}
1356
1357
1358void
1359BlockWriter::_UnmarkWriting(cached_block* block)
1360{
1361	block->busy_writing = false;
1362	if (block->previous_transaction != NULL)
1363		block->previous_transaction->busy_writing_count--;
1364	fCache->busy_writing_count--;
1365
1366	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1367		|| block->busy_writing_waiters) {
1368		fCache->busy_writing_waiters = false;
1369		block->busy_writing_waiters = false;
1370		fCache->busy_writing_condition.NotifyAll();
1371	}
1372}
1373
1374
1375/*static*/ int
1376BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1377{
1378	cached_block* blockA = *(cached_block**)_blockA;
1379	cached_block* blockB = *(cached_block**)_blockB;
1380
1381	off_t diff = blockA->block_number - blockB->block_number;
1382	if (diff > 0)
1383		return 1;
1384
1385	return diff < 0 ? -1 : 0;
1386}
1387
1388
1389//	#pragma mark - block_cache
1390
1391
1392block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1393		bool readOnly)
1394	:
1395	hash(NULL),
1396	fd(_fd),
1397	max_blocks(numBlocks),
1398	block_size(blockSize),
1399	next_transaction_id(1),
1400	last_transaction(NULL),
1401	transaction_hash(NULL),
1402	buffer_cache(NULL),
1403	unused_block_count(0),
1404	busy_reading_count(0),
1405	busy_reading_waiters(false),
1406	busy_writing_count(0),
1407	busy_writing_waiters(0),
1408	last_block_write(0),
1409	last_block_write_duration(0),
1410	num_dirty_blocks(0),
1411	read_only(readOnly)
1412{
1413}
1414
1415
1416/*! Should be called with the cache's lock held. */
1417block_cache::~block_cache()
1418{
1419	unregister_low_resource_handler(&_LowMemoryHandler, this);
1420
1421	delete transaction_hash;
1422	delete hash;
1423
1424	delete_object_cache(buffer_cache);
1425
1426	mutex_destroy(&lock);
1427}
1428
1429
1430status_t
1431block_cache::Init()
1432{
1433	busy_reading_condition.Init(this, "cache block busy_reading");
1434	busy_writing_condition.Init(this, "cache block busy writing");
1435	condition_variable.Init(this, "cache transaction sync");
1436	mutex_init(&lock, "block cache");
1437
1438	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1439		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1440	if (buffer_cache == NULL)
1441		return B_NO_MEMORY;
1442
1443	hash = new BlockTable();
1444	if (hash == NULL || hash->Init(1024) != B_OK)
1445		return B_NO_MEMORY;
1446
1447	transaction_hash = new(std::nothrow) TransactionTable();
1448	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1449		return B_NO_MEMORY;
1450
1451	return register_low_resource_handler(&_LowMemoryHandler, this,
1452		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1453			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1454}
1455
1456
1457void
1458block_cache::Free(void* buffer)
1459{
1460	if (buffer != NULL)
1461		object_cache_free(buffer_cache, buffer, 0);
1462}
1463
1464
1465void*
1466block_cache::Allocate()
1467{
1468	void* block = object_cache_alloc(buffer_cache, 0);
1469	if (block != NULL)
1470		return block;
1471
1472	// recycle existing before allocating a new one
1473	RemoveUnusedBlocks(100);
1474
1475	return object_cache_alloc(buffer_cache, 0);
1476}
1477
1478
1479void
1480block_cache::FreeBlock(cached_block* block)
1481{
1482	Free(block->current_data);
1483
1484	if (block->original_data != NULL || block->parent_data != NULL) {
1485		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1486			block->block_number, block->original_data, block->parent_data);
1487	}
1488
1489#if BLOCK_CACHE_DEBUG_CHANGED
1490	Free(block->compare);
1491#endif
1492
1493	object_cache_free(sBlockCache, block, 0);
1494}
1495
1496
1497/*! Allocates a new block for \a blockNumber, ready for use */
1498cached_block*
1499block_cache::NewBlock(off_t blockNumber)
1500{
1501	cached_block* block = NULL;
1502
1503	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1504			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1505		// recycle existing instead of allocating a new one
1506		block = _GetUnusedBlock();
1507	}
1508	if (block == NULL) {
1509		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1510		if (block != NULL) {
1511			block->current_data = Allocate();
1512			if (block->current_data == NULL) {
1513				object_cache_free(sBlockCache, block, 0);
1514				return NULL;
1515			}
1516		} else {
1517			TB(Error(this, blockNumber, "allocation failed"));
1518			dprintf("block allocation failed, unused list is %sempty.\n",
1519				unused_blocks.IsEmpty() ? "" : "not ");
1520
1521			// allocation failed, try to reuse an unused block
1522			block = _GetUnusedBlock();
1523			if (block == NULL) {
1524				TB(Error(this, blockNumber, "get unused failed"));
1525				FATAL(("could not allocate block!\n"));
1526				return NULL;
1527			}
1528		}
1529	}
1530
1531	block->block_number = blockNumber;
1532	block->ref_count = 0;
1533	block->last_accessed = 0;
1534	block->transaction_next = NULL;
1535	block->transaction = block->previous_transaction = NULL;
1536	block->original_data = NULL;
1537	block->parent_data = NULL;
1538	block->busy_reading = false;
1539	block->busy_writing = false;
1540	block->is_writing = false;
1541	block->is_dirty = false;
1542	block->unused = false;
1543	block->discard = false;
1544	block->busy_reading_waiters = false;
1545	block->busy_writing_waiters = false;
1546#if BLOCK_CACHE_DEBUG_CHANGED
1547	block->compare = NULL;
1548#endif
1549
1550	return block;
1551}
1552
1553
1554void
1555block_cache::FreeBlockParentData(cached_block* block)
1556{
1557	ASSERT(block->parent_data != NULL);
1558	if (block->parent_data != block->current_data)
1559		Free(block->parent_data);
1560	block->parent_data = NULL;
1561}
1562
1563
1564void
1565block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1566{
1567	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1568
1569	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1570			cached_block* block = iterator.Next();) {
1571		if (minSecondsOld >= block->LastAccess()) {
1572			// The list is sorted by last access
1573			break;
1574		}
1575		if (block->busy_reading || block->busy_writing)
1576			continue;
1577
1578		TB(Flush(this, block));
1579		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1580			block->block_number, block->last_accessed));
1581
1582		// this can only happen if no transactions are used
1583		if (block->is_dirty && !block->discard) {
1584			if (block->busy_writing)
1585				continue;
1586
1587			BlockWriter::WriteBlock(this, block);
1588		}
1589
1590		// remove block from lists
1591		iterator.Remove();
1592		unused_block_count--;
1593		RemoveBlock(block);
1594
1595		if (--count <= 0)
1596			break;
1597	}
1598}
1599
1600
1601void
1602block_cache::RemoveBlock(cached_block* block)
1603{
1604	hash->Remove(block);
1605	FreeBlock(block);
1606}
1607
1608
1609/*!	Discards the block from a transaction (this method must not be called
1610	for blocks not part of a transaction).
1611*/
1612void
1613block_cache::DiscardBlock(cached_block* block)
1614{
1615	ASSERT(block->discard);
1616	ASSERT(block->previous_transaction == NULL);
1617
1618	if (block->parent_data != NULL)
1619		FreeBlockParentData(block);
1620
1621	if (block->original_data != NULL) {
1622		Free(block->original_data);
1623		block->original_data = NULL;
1624	}
1625
1626	RemoveBlock(block);
1627}
1628
1629
1630void
1631block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1632{
1633	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1634
1635	// free some blocks according to the low memory state
1636	// (if there is enough memory left, we don't free any)
1637
1638	block_cache* cache = (block_cache*)data;
1639	if (cache->unused_block_count <= 1)
1640		return;
1641
1642	int32 free = 0;
1643	int32 secondsOld = 0;
1644	switch (level) {
1645		case B_NO_LOW_RESOURCE:
1646			return;
1647		case B_LOW_RESOURCE_NOTE:
1648			free = cache->unused_block_count / 4;
1649			secondsOld = 120;
1650			break;
1651		case B_LOW_RESOURCE_WARNING:
1652			free = cache->unused_block_count / 2;
1653			secondsOld = 10;
1654			break;
1655		case B_LOW_RESOURCE_CRITICAL:
1656			free = cache->unused_block_count - 1;
1657			secondsOld = 0;
1658			break;
1659	}
1660
1661	MutexLocker locker(&cache->lock);
1662
1663	if (!locker.IsLocked()) {
1664		// If our block_cache were deleted, it could be that we had
1665		// been called before that deletion went through, therefore,
1666		// acquiring its lock might fail.
1667		return;
1668	}
1669
1670#ifdef TRACE_BLOCK_CACHE
1671	uint32 oldUnused = cache->unused_block_count;
1672#endif
1673
1674	cache->RemoveUnusedBlocks(free, secondsOld);
1675
1676	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1677		cache, oldUnused, cache->unused_block_count));
1678}
1679
1680
1681cached_block*
1682block_cache::_GetUnusedBlock()
1683{
1684	TRACE(("block_cache: get unused block\n"));
1685
1686	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1687			cached_block* block = iterator.Next();) {
1688		TB(Flush(this, block, true));
1689		// this can only happen if no transactions are used
1690		if (block->is_dirty && !block->busy_writing && !block->discard)
1691			BlockWriter::WriteBlock(this, block);
1692
1693		// remove block from lists
1694		iterator.Remove();
1695		unused_block_count--;
1696		hash->Remove(block);
1697
1698		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1699		block->unused = false;
1700
1701		// TODO: see if compare data is handled correctly here!
1702#if BLOCK_CACHE_DEBUG_CHANGED
1703		if (block->compare != NULL)
1704			Free(block->compare);
1705#endif
1706		return block;
1707	}
1708
1709	return NULL;
1710}
1711
1712
1713//	#pragma mark - private block functions
1714
1715
1716/*!	Cache must be locked.
1717*/
1718static void
1719mark_block_busy_reading(block_cache* cache, cached_block* block)
1720{
1721	block->busy_reading = true;
1722	cache->busy_reading_count++;
1723}
1724
1725
1726/*!	Cache must be locked.
1727*/
1728static void
1729mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1730{
1731	block->busy_reading = false;
1732	cache->busy_reading_count--;
1733
1734	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1735		|| block->busy_reading_waiters) {
1736		cache->busy_reading_waiters = false;
1737		block->busy_reading_waiters = false;
1738		cache->busy_reading_condition.NotifyAll();
1739	}
1740}
1741
1742
1743/*!	Cache must be locked.
1744*/
1745static void
1746wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1747{
1748	while (block->busy_reading) {
1749		// wait for at least the specified block to be read in
1750		ConditionVariableEntry entry;
1751		cache->busy_reading_condition.Add(&entry);
1752		block->busy_reading_waiters = true;
1753
1754		mutex_unlock(&cache->lock);
1755
1756		entry.Wait();
1757
1758		mutex_lock(&cache->lock);
1759	}
1760}
1761
1762
1763/*!	Cache must be locked.
1764*/
1765static void
1766wait_for_busy_reading_blocks(block_cache* cache)
1767{
1768	while (cache->busy_reading_count != 0) {
1769		// wait for all blocks to be read in
1770		ConditionVariableEntry entry;
1771		cache->busy_reading_condition.Add(&entry);
1772		cache->busy_reading_waiters = true;
1773
1774		mutex_unlock(&cache->lock);
1775
1776		entry.Wait();
1777
1778		mutex_lock(&cache->lock);
1779	}
1780}
1781
1782
1783/*!	Cache must be locked.
1784*/
1785static void
1786wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1787{
1788	while (block->busy_writing) {
1789		// wait for all blocks to be written back
1790		ConditionVariableEntry entry;
1791		cache->busy_writing_condition.Add(&entry);
1792		block->busy_writing_waiters = true;
1793
1794		mutex_unlock(&cache->lock);
1795
1796		entry.Wait();
1797
1798		mutex_lock(&cache->lock);
1799	}
1800}
1801
1802
1803/*!	Cache must be locked.
1804*/
1805static void
1806wait_for_busy_writing_blocks(block_cache* cache)
1807{
1808	while (cache->busy_writing_count != 0) {
1809		// wait for all blocks to be written back
1810		ConditionVariableEntry entry;
1811		cache->busy_writing_condition.Add(&entry);
1812		cache->busy_writing_waiters = true;
1813
1814		mutex_unlock(&cache->lock);
1815
1816		entry.Wait();
1817
1818		mutex_lock(&cache->lock);
1819	}
1820}
1821
1822
1823/*!	Removes a reference from the specified \a block. If this was the last
1824	reference, the block is moved into the unused list.
1825	In low memory situations, it will also free some blocks from that list,
1826	but not necessarily the \a block it just released.
1827*/
1828static void
1829put_cached_block(block_cache* cache, cached_block* block)
1830{
1831#if BLOCK_CACHE_DEBUG_CHANGED
1832	if (!block->is_dirty && block->compare != NULL
1833		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1834		dprintf("new block:\n");
1835		dump_block((const char*)block->current_data, 256, "  ");
1836		dprintf("unchanged block:\n");
1837		dump_block((const char*)block->compare, 256, "  ");
1838		BlockWriter::WriteBlock(cache, block);
1839		panic("block_cache: supposed to be clean block was changed!\n");
1840
1841		cache->Free(block->compare);
1842		block->compare = NULL;
1843	}
1844#endif
1845	TB(Put(cache, block));
1846
1847	if (block->ref_count < 1) {
1848		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1849		return;
1850	}
1851
1852	if (--block->ref_count == 0
1853		&& block->transaction == NULL && block->previous_transaction == NULL) {
1854		// This block is not used anymore, and not part of any transaction
1855		block->is_writing = false;
1856
1857		if (block->discard) {
1858			cache->RemoveBlock(block);
1859		} else {
1860			// put this block in the list of unused blocks
1861			ASSERT(!block->unused);
1862			block->unused = true;
1863
1864			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1865			cache->unused_blocks.Add(block);
1866			cache->unused_block_count++;
1867		}
1868	}
1869}
1870
1871
1872static void
1873put_cached_block(block_cache* cache, off_t blockNumber)
1874{
1875	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1876		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1877			blockNumber, cache->max_blocks - 1);
1878	}
1879
1880	cached_block* block = cache->hash->Lookup(blockNumber);
1881	if (block != NULL)
1882		put_cached_block(cache, block);
1883	else {
1884		TB(Error(cache, blockNumber, "put unknown"));
1885	}
1886}
1887
1888
1889/*!	Retrieves the block \a blockNumber from the hash table, if it's already
1890	there, or reads it from the disk.
1891	You need to have the cache locked when calling this function.
1892
1893	\param _allocated tells you whether or not a new block has been allocated
1894		to satisfy your request.
1895	\param readBlock if \c false, the block will not be read in case it was
1896		not already in the cache. The block you retrieve may contain random
1897		data. If \c true, the cache will be temporarily unlocked while the
1898		block is read in.
1899*/
1900static cached_block*
1901get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1902	bool readBlock = true)
1903{
1904	ASSERT_LOCKED_MUTEX(&cache->lock);
1905
1906	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1907		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1908			blockNumber, cache->max_blocks - 1);
1909		return NULL;
1910	}
1911
1912retry:
1913	cached_block* block = cache->hash->Lookup(blockNumber);
1914	*_allocated = false;
1915
1916	if (block == NULL) {
1917		// put block into cache
1918		block = cache->NewBlock(blockNumber);
1919		if (block == NULL)
1920			return NULL;
1921
1922		cache->hash->Insert(block);
1923		*_allocated = true;
1924	} else if (block->busy_reading) {
1925		// The block is currently busy_reading - wait and try again later
1926		wait_for_busy_reading_block(cache, block);
1927		goto retry;
1928	}
1929
1930	if (block->unused) {
1931		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
1932		block->unused = false;
1933		cache->unused_blocks.Remove(block);
1934		cache->unused_block_count--;
1935	}
1936
1937	if (*_allocated && readBlock) {
1938		// read block into cache
1939		int32 blockSize = cache->block_size;
1940
1941		mark_block_busy_reading(cache, block);
1942		mutex_unlock(&cache->lock);
1943
1944		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1945			block->current_data, blockSize);
1946
1947		mutex_lock(&cache->lock);
1948		if (bytesRead < blockSize) {
1949			cache->RemoveBlock(block);
1950			TB(Error(cache, blockNumber, "read failed", bytesRead));
1951
1952			TRACE_ALWAYS(("could not read block %" B_PRIdOFF ": bytesRead: %zd, error: %s\n",
1953				blockNumber, bytesRead, strerror(errno)));
1954			return NULL;
1955		}
1956		TB(Read(cache, block));
1957
1958		mark_block_unbusy_reading(cache, block);
1959	}
1960
1961	block->ref_count++;
1962	block->last_accessed = system_time() / 1000000L;
1963
1964	return block;
1965}
1966
1967
1968/*!	Returns the writable block data for the requested blockNumber.
1969	If \a cleared is true, the block is not read from disk; an empty block
1970	is returned.
1971
1972	This is the only method to insert a block into a transaction. It makes
1973	sure that the previous block contents are preserved in that case.
1974*/
1975static void*
1976get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1977	off_t length, int32 transactionID, bool cleared)
1978{
1979	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
1980		blockNumber, transactionID));
1981
1982	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1983		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1984			blockNumber, cache->max_blocks - 1);
1985	}
1986
1987	bool allocated;
1988	cached_block* block = get_cached_block(cache, blockNumber, &allocated,
1989		!cleared);
1990	if (block == NULL)
1991		return NULL;
1992
1993	if (block->busy_writing)
1994		wait_for_busy_writing_block(cache, block);
1995
1996	block->discard = false;
1997
1998	// if there is no transaction support, we just return the current block
1999	if (transactionID == -1) {
2000		if (cleared) {
2001			mark_block_busy_reading(cache, block);
2002			mutex_unlock(&cache->lock);
2003
2004			memset(block->current_data, 0, cache->block_size);
2005
2006			mutex_lock(&cache->lock);
2007			mark_block_unbusy_reading(cache, block);
2008		}
2009
2010		block->is_writing = true;
2011
2012		if (!block->is_dirty) {
2013			cache->num_dirty_blocks++;
2014			block->is_dirty = true;
2015				// mark the block as dirty
2016		}
2017
2018		TB(Get(cache, block));
2019		return block->current_data;
2020	}
2021
2022	cache_transaction* transaction = block->transaction;
2023
2024	if (transaction != NULL && transaction->id != transactionID) {
2025		// TODO: we have to wait here until the other transaction is done.
2026		//	Maybe we should even panic, since we can't prevent any deadlocks.
2027		panic("get_writable_cached_block(): asked to get busy writable block "
2028			"(transaction %" B_PRId32 ")\n", block->transaction->id);
2029		put_cached_block(cache, block);
2030		return NULL;
2031	}
2032	if (transaction == NULL && transactionID != -1) {
2033		// get new transaction
2034		transaction = lookup_transaction(cache, transactionID);
2035		if (transaction == NULL) {
2036			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
2037				transactionID);
2038			put_cached_block(cache, block);
2039			return NULL;
2040		}
2041		if (!transaction->open) {
2042			panic("get_writable_cached_block(): transaction already done!\n");
2043			put_cached_block(cache, block);
2044			return NULL;
2045		}
2046
2047		block->transaction = transaction;
2048
2049		// attach the block to the transaction block list
2050		block->transaction_next = transaction->first_block;
2051		transaction->first_block = block;
2052		transaction->num_blocks++;
2053	}
2054	if (transaction != NULL)
2055		transaction->last_used = system_time();
2056
2057	bool wasUnchanged = block->original_data == NULL
2058		|| block->previous_transaction != NULL;
2059
2060	if (!(allocated && cleared) && block->original_data == NULL) {
2061		// we already have data, so we need to preserve it
2062		block->original_data = cache->Allocate();
2063		if (block->original_data == NULL) {
2064			TB(Error(cache, blockNumber, "allocate original failed"));
2065			FATAL(("could not allocate original_data\n"));
2066			put_cached_block(cache, block);
2067			return NULL;
2068		}
2069
2070		mark_block_busy_reading(cache, block);
2071		mutex_unlock(&cache->lock);
2072
2073		memcpy(block->original_data, block->current_data, cache->block_size);
2074
2075		mutex_lock(&cache->lock);
2076		mark_block_unbusy_reading(cache, block);
2077	}
2078	if (block->parent_data == block->current_data) {
2079		// remember any previous contents for the parent transaction
2080		block->parent_data = cache->Allocate();
2081		if (block->parent_data == NULL) {
2082			// TODO: maybe we should just continue the current transaction in
2083			// this case...
2084			TB(Error(cache, blockNumber, "allocate parent failed"));
2085			FATAL(("could not allocate parent\n"));
2086			put_cached_block(cache, block);
2087			return NULL;
2088		}
2089
2090		mark_block_busy_reading(cache, block);
2091		mutex_unlock(&cache->lock);
2092
2093		memcpy(block->parent_data, block->current_data, cache->block_size);
2094
2095		mutex_lock(&cache->lock);
2096		mark_block_unbusy_reading(cache, block);
2097
2098		transaction->sub_num_blocks++;
2099	} else if (transaction != NULL && transaction->has_sub_transaction
2100		&& block->parent_data == NULL && wasUnchanged)
2101		transaction->sub_num_blocks++;
2102
2103	if (cleared) {
2104		mark_block_busy_reading(cache, block);
2105		mutex_unlock(&cache->lock);
2106
2107		memset(block->current_data, 0, cache->block_size);
2108
2109		mutex_lock(&cache->lock);
2110		mark_block_unbusy_reading(cache, block);
2111	}
2112
2113	block->is_dirty = true;
2114	TB(Get(cache, block));
2115	TB2(BlockData(cache, block, "get writable"));
2116
2117	return block->current_data;
2118}
2119
2120
2121#if DEBUG_BLOCK_CACHE
2122
2123
2124static void
2125dump_block(cached_block* block)
2126{
2127	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2128		" %c%c%c%c%c%c %08lx %08lx\n",
2129		(addr_t)block, block->block_number,
2130		(addr_t)block->current_data, (addr_t)block->original_data,
2131		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2132		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2133		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2134		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2135		(addr_t)block->transaction,
2136		(addr_t)block->previous_transaction);
2137}
2138
2139
2140static void
2141dump_block_long(cached_block* block)
2142{
2143	kprintf("BLOCK %p\n", block);
2144	kprintf(" current data:  %p\n", block->current_data);
2145	kprintf(" original data: %p\n", block->original_data);
2146	kprintf(" parent data:   %p\n", block->parent_data);
2147#if BLOCK_CACHE_DEBUG_CHANGED
2148	kprintf(" compare data:  %p\n", block->compare);
2149#endif
2150	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2151	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2152	kprintf(" flags:        ");
2153	if (block->busy_reading)
2154		kprintf(" busy_reading");
2155	if (block->busy_writing)
2156		kprintf(" busy_writing");
2157	if (block->is_writing)
2158		kprintf(" is-writing");
2159	if (block->is_dirty)
2160		kprintf(" is-dirty");
2161	if (block->unused)
2162		kprintf(" unused");
2163	if (block->discard)
2164		kprintf(" discard");
2165	kprintf("\n");
2166	if (block->transaction != NULL) {
2167		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2168			block->transaction->id);
2169		if (block->transaction_next != NULL) {
2170			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2171				block->transaction_next->block_number);
2172		}
2173	}
2174	if (block->previous_transaction != NULL) {
2175		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2176			block->previous_transaction,
2177			block->previous_transaction->id);
2178	}
2179
2180	set_debug_variable("_current", (addr_t)block->current_data);
2181	set_debug_variable("_original", (addr_t)block->original_data);
2182	set_debug_variable("_parent", (addr_t)block->parent_data);
2183}
2184
2185
2186static int
2187dump_cached_block(int argc, char** argv)
2188{
2189	if (argc != 2) {
2190		kprintf("usage: %s <block-address>\n", argv[0]);
2191		return 0;
2192	}
2193
2194	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2195	return 0;
2196}
2197
2198
2199static int
2200dump_cache(int argc, char** argv)
2201{
2202	bool showTransactions = false;
2203	bool showBlocks = false;
2204	int32 i = 1;
2205	while (argv[i] != NULL && argv[i][0] == '-') {
2206		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2207			switch (arg[0]) {
2208				case 'b':
2209					showBlocks = true;
2210					break;
2211				case 't':
2212					showTransactions = true;
2213					break;
2214				default:
2215					print_debugger_command_usage(argv[0]);
2216					return 0;
2217			}
2218		}
2219		i++;
2220	}
2221
2222	if (i >= argc) {
2223		print_debugger_command_usage(argv[0]);
2224		return 0;
2225	}
2226
2227	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2228	if (cache == NULL) {
2229		kprintf("invalid cache address\n");
2230		return 0;
2231	}
2232
2233	off_t blockNumber = -1;
2234	if (i + 1 < argc) {
2235		blockNumber = parse_expression(argv[i + 1]);
2236		cached_block* block = cache->hash->Lookup(blockNumber);
2237		if (block != NULL)
2238			dump_block_long(block);
2239		else
2240			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2241		return 0;
2242	}
2243
2244	kprintf("BLOCK CACHE: %p\n", cache);
2245
2246	kprintf(" fd:           %d\n", cache->fd);
2247	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2248	kprintf(" block_size:   %zu\n", cache->block_size);
2249	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2250	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2251	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2252		cache->busy_reading_waiters ? "has" : "no");
2253	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2254		cache->busy_writing_waiters ? "has" : "no");
2255
2256	if (!cache->pending_notifications.IsEmpty()) {
2257		kprintf(" pending notifications:\n");
2258
2259		NotificationList::Iterator iterator
2260			= cache->pending_notifications.GetIterator();
2261		while (iterator.HasNext()) {
2262			cache_notification* notification = iterator.Next();
2263
2264			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2265				notification->events_pending, notification->hook,
2266				notification->data);
2267		}
2268	}
2269
2270	if (showTransactions) {
2271		kprintf(" transactions:\n");
2272		kprintf("address       id state  blocks  main   sub\n");
2273
2274		TransactionTable::Iterator iterator(cache->transaction_hash);
2275
2276		while (iterator.HasNext()) {
2277			cache_transaction* transaction = iterator.Next();
2278			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2279				B_PRId32 "\n", transaction, transaction->id,
2280				transaction->open ? "open" : "closed",
2281				transaction->num_blocks, transaction->main_num_blocks,
2282				transaction->sub_num_blocks);
2283		}
2284	}
2285
2286	if (showBlocks) {
2287		kprintf(" blocks:\n");
2288		kprintf("address  block no. current  original parent    refs access "
2289			"flags transact prev. trans\n");
2290	}
2291
2292	uint32 referenced = 0;
2293	uint32 count = 0;
2294	uint32 dirty = 0;
2295	uint32 discarded = 0;
2296	BlockTable::Iterator iterator(cache->hash);
2297	while (iterator.HasNext()) {
2298		cached_block* block = iterator.Next();
2299		if (showBlocks)
2300			dump_block(block);
2301
2302		if (block->is_dirty)
2303			dirty++;
2304		if (block->discard)
2305			discarded++;
2306		if (block->ref_count)
2307			referenced++;
2308		count++;
2309	}
2310
2311	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2312		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2313		" in unused.\n",
2314		count, dirty, discarded, referenced, cache->busy_reading_count,
2315		cache->unused_block_count);
2316	return 0;
2317}
2318
2319
2320static int
2321dump_transaction(int argc, char** argv)
2322{
2323	bool showBlocks = false;
2324	int i = 1;
2325	if (argc > 1 && !strcmp(argv[1], "-b")) {
2326		showBlocks = true;
2327		i++;
2328	}
2329
2330	if (argc - i < 1 || argc - i > 2) {
2331		print_debugger_command_usage(argv[0]);
2332		return 0;
2333	}
2334
2335	cache_transaction* transaction = NULL;
2336
2337	if (argc - i == 1) {
2338		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2339	} else {
2340		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2341		int32 id = parse_expression(argv[i + 1]);
2342		transaction = lookup_transaction(cache, id);
2343		if (transaction == NULL) {
2344			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2345			return 0;
2346		}
2347	}
2348
2349	kprintf("TRANSACTION %p\n", transaction);
2350
2351	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2352	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2353	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2354	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2355	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2356	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2357	kprintf(" idle:           %" B_PRId64 " secs\n",
2358		(system_time() - transaction->last_used) / 1000000);
2359
2360	kprintf(" listeners:\n");
2361
2362	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2363	while (iterator.HasNext()) {
2364		cache_listener* listener = iterator.Next();
2365
2366		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2367			listener->hook, listener->data);
2368	}
2369
2370	if (!showBlocks)
2371		return 0;
2372
2373	kprintf(" blocks:\n");
2374	kprintf("address  block no. current  original parent    refs access "
2375		"flags transact prev. trans\n");
2376
2377	cached_block* block = transaction->first_block;
2378	while (block != NULL) {
2379		dump_block(block);
2380		block = block->transaction_next;
2381	}
2382
2383	kprintf("--\n");
2384
2385	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2386	while (blockIterator.HasNext()) {
2387		block = blockIterator.Next();
2388		dump_block(block);
2389	}
2390
2391	return 0;
2392}
2393
2394
2395static int
2396dump_caches(int argc, char** argv)
2397{
2398	kprintf("Block caches:\n");
2399	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2400	while (i.HasNext()) {
2401		block_cache* cache = i.Next();
2402		if (cache == (block_cache*)&sMarkCache)
2403			continue;
2404
2405		kprintf("  %p\n", cache);
2406	}
2407
2408	return 0;
2409}
2410
2411
2412#if BLOCK_CACHE_BLOCK_TRACING >= 2
2413static int
2414dump_block_data(int argc, char** argv)
2415{
2416	using namespace BlockTracing;
2417
2418	// Determine which blocks to show
2419
2420	bool printStackTrace = true;
2421	uint32 which = 0;
2422	int32 i = 1;
2423	while (i < argc && argv[i][0] == '-') {
2424		char* arg = &argv[i][1];
2425		while (arg[0]) {
2426			switch (arg[0]) {
2427				case 'c':
2428					which |= BlockData::kCurrent;
2429					break;
2430				case 'p':
2431					which |= BlockData::kParent;
2432					break;
2433				case 'o':
2434					which |= BlockData::kOriginal;
2435					break;
2436
2437				default:
2438					kprintf("invalid block specifier (only o/c/p are "
2439						"allowed).\n");
2440					return 0;
2441			}
2442			arg++;
2443		}
2444
2445		i++;
2446	}
2447	if (which == 0)
2448		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2449
2450	if (i == argc) {
2451		print_debugger_command_usage(argv[0]);
2452		return 0;
2453	}
2454
2455	// Get the range of blocks to print
2456
2457	int64 from = parse_expression(argv[i]);
2458	int64 to = from;
2459	if (argc > i + 1)
2460		to = parse_expression(argv[i + 1]);
2461	if (to < from)
2462		to = from;
2463
2464	uint32 offset = 0;
2465	uint32 size = LONG_MAX;
2466	if (argc > i + 2)
2467		offset = parse_expression(argv[i + 2]);
2468	if (argc > i + 3)
2469		size = parse_expression(argv[i + 3]);
2470
2471	TraceEntryIterator iterator;
2472	iterator.MoveTo(from - 1);
2473
2474	static char sBuffer[1024];
2475	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2476
2477	while (TraceEntry* entry = iterator.Next()) {
2478		int32 index = iterator.Index();
2479		if (index > to)
2480			break;
2481
2482		Action* action = dynamic_cast<Action*>(entry);
2483		if (action != NULL) {
2484			out.Clear();
2485			out.DumpEntry(action);
2486			continue;
2487		}
2488
2489		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2490		if (blockData == NULL)
2491			continue;
2492
2493		out.Clear();
2494
2495		const char* dump = out.DumpEntry(entry);
2496		int length = strlen(dump);
2497		if (length > 0 && dump[length - 1] == '\n')
2498			length--;
2499
2500		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2501
2502		if (printStackTrace) {
2503			out.Clear();
2504			entry->DumpStackTrace(out);
2505			if (out.Size() > 0)
2506				kputs(out.Buffer());
2507		}
2508
2509		blockData->DumpBlocks(which, offset, size);
2510	}
2511
2512	return 0;
2513}
2514#endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2515
2516
2517#endif	// DEBUG_BLOCK_CACHE
2518
2519
2520/*!	Traverses through the block_cache list, and returns one cache after the
2521	other. The cache returned is automatically locked when you get it, and
2522	unlocked with the next call to this function. Ignores caches that are in
2523	deletion state.
2524	Returns \c NULL when the end of the list is reached.
2525*/
2526static block_cache*
2527get_next_locked_block_cache(block_cache* last)
2528{
2529	MutexLocker _(sCachesLock);
2530
2531	block_cache* cache;
2532	if (last != NULL) {
2533		mutex_unlock(&last->lock);
2534
2535		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2536		sCaches.Remove((block_cache*)&sMarkCache);
2537	} else
2538		cache = sCaches.Head();
2539
2540	if (cache != NULL) {
2541		mutex_lock(&cache->lock);
2542		sCaches.Insert(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2543	}
2544
2545	return cache;
2546}
2547
2548
2549/*!	Background thread that continuously checks for pending notifications of
2550	all caches.
2551	Every two seconds, it will also write back up to 64 blocks per cache.
2552*/
2553static status_t
2554block_notifier_and_writer(void* /*data*/)
2555{
2556	const bigtime_t kDefaultTimeout = 2000000LL;
2557	bigtime_t timeout = kDefaultTimeout;
2558
2559	while (true) {
2560		bigtime_t start = system_time();
2561
2562		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2563			B_RELATIVE_TIMEOUT, timeout);
2564		if (status == B_OK) {
2565			flush_pending_notifications();
2566			timeout -= system_time() - start;
2567			continue;
2568		}
2569
2570		// Write 64 blocks of each block_cache roughly every 2 seconds,
2571		// potentially more or less depending on congestion and drive speeds
2572		// (usually much less.) We do not want to queue everything at once
2573		// because a future transaction might then get held up waiting for
2574		// a specific block to be written.
2575		timeout = kDefaultTimeout;
2576		size_t usedMemory;
2577		object_cache_get_usage(sBlockCache, &usedMemory);
2578
2579		block_cache* cache = NULL;
2580		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2581			// Give some breathing room: wait 2x the length of the potential
2582			// maximum block count-sized write between writes, and also skip
2583			// if there are more than 16 blocks currently being written.
2584			const bigtime_t next = cache->last_block_write
2585					+ cache->last_block_write_duration * 2 * 64;
2586			if (cache->busy_writing_count > 16 || system_time() < next) {
2587				if (cache->last_block_write_duration > 0) {
2588					timeout = min_c(timeout,
2589						cache->last_block_write_duration * 2 * 64);
2590				}
2591				continue;
2592			}
2593
2594			BlockWriter writer(cache, 64);
2595			bool hasMoreBlocks = false;
2596
2597			size_t cacheUsedMemory;
2598			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2599			usedMemory += cacheUsedMemory;
2600
2601			if (cache->num_dirty_blocks) {
2602				// This cache is not using transactions, we'll scan the blocks
2603				// directly
2604				BlockTable::Iterator iterator(cache->hash);
2605
2606				while (iterator.HasNext()) {
2607					cached_block* block = iterator.Next();
2608					if (block->CanBeWritten() && !writer.Add(block)) {
2609						hasMoreBlocks = true;
2610						break;
2611					}
2612				}
2613			} else {
2614				TransactionTable::Iterator iterator(cache->transaction_hash);
2615
2616				while (iterator.HasNext()) {
2617					cache_transaction* transaction = iterator.Next();
2618					if (transaction->open) {
2619						if (system_time() > transaction->last_used
2620								+ kTransactionIdleTime) {
2621							// Transaction is open but idle
2622							notify_transaction_listeners(cache, transaction,
2623								TRANSACTION_IDLE);
2624						}
2625						continue;
2626					}
2627
2628					bool hasLeftOvers;
2629						// we ignore this one
2630					if (!writer.Add(transaction, hasLeftOvers)) {
2631						hasMoreBlocks = true;
2632						break;
2633					}
2634				}
2635			}
2636
2637			writer.Write();
2638
2639			if (hasMoreBlocks && cache->last_block_write_duration > 0) {
2640				// There are probably still more blocks that we could write, so
2641				// see if we can decrease the timeout.
2642				timeout = min_c(timeout,
2643					cache->last_block_write_duration * 2 * 64);
2644			}
2645
2646			if ((block_cache_used_memory() / B_PAGE_SIZE)
2647					> vm_page_num_pages() / 2) {
2648				// Try to reduce memory usage to half of the available
2649				// RAM at maximum
2650				cache->RemoveUnusedBlocks(1000, 10);
2651			}
2652		}
2653
2654		MutexLocker _(sCachesMemoryUseLock);
2655		sUsedMemory = usedMemory;
2656	}
2657
2658	// never can get here
2659	return B_OK;
2660}
2661
2662
2663/*!	Notify function for wait_for_notifications(). */
2664static void
2665notify_sync(int32 transactionID, int32 event, void* _cache)
2666{
2667	block_cache* cache = (block_cache*)_cache;
2668
2669	cache->condition_variable.NotifyOne();
2670}
2671
2672
2673/*!	Must be called with the sCachesLock held. */
2674static bool
2675is_valid_cache(block_cache* cache)
2676{
2677	ASSERT_LOCKED_MUTEX(&sCachesLock);
2678
2679	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2680	while (iterator.HasNext()) {
2681		if (cache == iterator.Next())
2682			return true;
2683	}
2684
2685	return false;
2686}
2687
2688
2689/*!	Waits until all pending notifications are carried out.
2690	Safe to be called from the block writer/notifier thread.
2691	You must not hold the \a cache lock when calling this function.
2692*/
2693static void
2694wait_for_notifications(block_cache* cache)
2695{
2696	MutexLocker locker(sCachesLock);
2697
2698	if (find_thread(NULL) == sNotifierWriterThread) {
2699		// We're the notifier thread, don't wait, but flush all pending
2700		// notifications directly.
2701		if (is_valid_cache(cache))
2702			flush_pending_notifications(cache);
2703		return;
2704	}
2705
2706	// add sync notification
2707	cache_notification notification;
2708	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2709		cache);
2710
2711	ConditionVariableEntry entry;
2712	cache->condition_variable.Add(&entry);
2713
2714	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2715	locker.Unlock();
2716
2717	// wait for notification hook to be called
2718	entry.Wait();
2719}
2720
2721
2722status_t
2723block_cache_init(void)
2724{
2725	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2726		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2727	if (sBlockCache == NULL)
2728		return B_NO_MEMORY;
2729
2730	sCacheNotificationCache = create_object_cache("cache notifications",
2731		sizeof(cache_listener), 8, NULL, NULL, NULL);
2732	if (sCacheNotificationCache == NULL)
2733		return B_NO_MEMORY;
2734
2735	new (&sCaches) DoublyLinkedList<block_cache>;
2736		// manually call constructor
2737
2738	sEventSemaphore = create_sem(0, "block cache event");
2739	if (sEventSemaphore < B_OK)
2740		return sEventSemaphore;
2741
2742	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2743		"block notifier/writer", B_LOW_PRIORITY, NULL);
2744	if (sNotifierWriterThread >= B_OK)
2745		resume_thread(sNotifierWriterThread);
2746
2747#if DEBUG_BLOCK_CACHE
2748	add_debugger_command_etc("block_caches", &dump_caches,
2749		"dumps all block caches", "\n", 0);
2750	add_debugger_command_etc("block_cache", &dump_cache,
2751		"dumps a specific block cache",
2752		"[-bt] <cache-address> [block-number]\n"
2753		"  -t lists the transactions\n"
2754		"  -b lists all blocks\n", 0);
2755	add_debugger_command("cached_block", &dump_cached_block,
2756		"dumps the specified cached block");
2757	add_debugger_command_etc("transaction", &dump_transaction,
2758		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2759		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2760		"  -b lists all blocks that are part of this transaction\n", 0);
2761#	if BLOCK_CACHE_BLOCK_TRACING >= 2
2762	add_debugger_command_etc("block_cache_data", &dump_block_data,
2763		"dumps the data blocks logged for the actions",
2764		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2765		"If no data specifier is used, all blocks are shown by default.\n"
2766		" -c       the current data is shown, if available.\n"
2767		" -p       the parent data is shown, if available.\n"
2768		" -o       the original data is shown, if available.\n"
2769		" <from>   first index of tracing entries to show.\n"
2770		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2771		" <offset> the offset of the block data.\n"
2772		" <from>   the size of the block data that is dumped\n", 0);
2773#	endif
2774#endif	// DEBUG_BLOCK_CACHE
2775
2776	return B_OK;
2777}
2778
2779
2780size_t
2781block_cache_used_memory(void)
2782{
2783	MutexLocker _(sCachesMemoryUseLock);
2784	return sUsedMemory;
2785}
2786
2787
2788//	#pragma mark - public transaction API
2789
2790
2791int32
2792cache_start_transaction(void* _cache)
2793{
2794	block_cache* cache = (block_cache*)_cache;
2795	TransactionLocker locker(cache);
2796
2797	if (cache->last_transaction && cache->last_transaction->open) {
2798		panic("last transaction (%" B_PRId32 ") still open!\n",
2799			cache->last_transaction->id);
2800	}
2801
2802	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2803	if (transaction == NULL)
2804		return B_NO_MEMORY;
2805
2806	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2807	cache->last_transaction = transaction;
2808
2809	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
2810	T(Action("start", cache, transaction));
2811
2812	cache->transaction_hash->Insert(transaction);
2813
2814	return transaction->id;
2815}
2816
2817
2818status_t
2819cache_sync_transaction(void* _cache, int32 id)
2820{
2821	block_cache* cache = (block_cache*)_cache;
2822	bool hadBusy;
2823
2824	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
2825
2826	do {
2827		TransactionLocker locker(cache);
2828		hadBusy = false;
2829
2830		BlockWriter writer(cache);
2831		TransactionTable::Iterator iterator(cache->transaction_hash);
2832
2833		while (iterator.HasNext()) {
2834			// close all earlier transactions which haven't been closed yet
2835			cache_transaction* transaction = iterator.Next();
2836
2837			if (transaction->busy_writing_count != 0) {
2838				hadBusy = true;
2839				continue;
2840			}
2841			if (transaction->id <= id && !transaction->open) {
2842				// write back all of their remaining dirty blocks
2843				T(Action("sync", cache, transaction));
2844
2845				bool hasLeftOvers;
2846				writer.Add(transaction, hasLeftOvers);
2847
2848				if (hasLeftOvers) {
2849					// This transaction contains blocks that a previous
2850					// transaction is trying to write back in this write run
2851					hadBusy = true;
2852				}
2853			}
2854		}
2855
2856		status_t status = writer.Write();
2857		if (status != B_OK)
2858			return status;
2859	} while (hadBusy);
2860
2861	wait_for_notifications(cache);
2862		// make sure that all pending TRANSACTION_WRITTEN notifications
2863		// are handled after we return
2864	return B_OK;
2865}
2866
2867
2868status_t
2869cache_end_transaction(void* _cache, int32 id,
2870	transaction_notification_hook hook, void* data)
2871{
2872	block_cache* cache = (block_cache*)_cache;
2873	TransactionLocker locker(cache);
2874
2875	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
2876
2877	cache_transaction* transaction = lookup_transaction(cache, id);
2878	if (transaction == NULL) {
2879		panic("cache_end_transaction(): invalid transaction ID\n");
2880		return B_BAD_VALUE;
2881	}
2882
2883	// Write back all pending transaction blocks
2884	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2885	if (status != B_OK)
2886		return status;
2887
2888	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2889
2890	if (hook != NULL
2891		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2892			hook, data) != B_OK) {
2893		return B_NO_MEMORY;
2894	}
2895
2896	T(Action("end", cache, transaction));
2897
2898	// iterate through all blocks and free the unchanged original contents
2899
2900	cached_block* next;
2901	for (cached_block* block = transaction->first_block; block != NULL;
2902			block = next) {
2903		next = block->transaction_next;
2904		ASSERT(block->previous_transaction == NULL);
2905
2906		if (block->discard) {
2907			// This block has been discarded in the transaction
2908			cache->DiscardBlock(block);
2909			transaction->num_blocks--;
2910			continue;
2911		}
2912
2913		if (block->original_data != NULL) {
2914			cache->Free(block->original_data);
2915			block->original_data = NULL;
2916		}
2917		if (block->parent_data != NULL) {
2918			ASSERT(transaction->has_sub_transaction);
2919			cache->FreeBlockParentData(block);
2920		}
2921
2922		// move the block to the previous transaction list
2923		transaction->blocks.Add(block);
2924
2925		block->previous_transaction = transaction;
2926		block->transaction_next = NULL;
2927		block->transaction = NULL;
2928	}
2929
2930	transaction->open = false;
2931	return B_OK;
2932}
2933
2934
2935status_t
2936cache_abort_transaction(void* _cache, int32 id)
2937{
2938	block_cache* cache = (block_cache*)_cache;
2939	TransactionLocker locker(cache);
2940
2941	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
2942
2943	cache_transaction* transaction = lookup_transaction(cache, id);
2944	if (transaction == NULL) {
2945		panic("cache_abort_transaction(): invalid transaction ID\n");
2946		return B_BAD_VALUE;
2947	}
2948
2949	T(Abort(cache, transaction));
2950	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2951
2952	// iterate through all blocks and restore their original contents
2953
2954	cached_block* block = transaction->first_block;
2955	cached_block* next;
2956	for (; block != NULL; block = next) {
2957		next = block->transaction_next;
2958
2959		if (block->original_data != NULL) {
2960			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
2961				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
2962			memcpy(block->current_data, block->original_data,
2963				cache->block_size);
2964			cache->Free(block->original_data);
2965			block->original_data = NULL;
2966		}
2967		if (transaction->has_sub_transaction && block->parent_data != NULL)
2968			cache->FreeBlockParentData(block);
2969
2970		block->transaction_next = NULL;
2971		block->transaction = NULL;
2972		block->discard = false;
2973		if (block->previous_transaction == NULL)
2974			block->is_dirty = false;
2975	}
2976
2977	cache->transaction_hash->Remove(transaction);
2978	delete_transaction(cache, transaction);
2979	return B_OK;
2980}
2981
2982
2983/*!	Acknowledges the current parent transaction, and starts a new transaction
2984	from its sub transaction.
2985	The new transaction also gets a new transaction ID.
2986*/
2987int32
2988cache_detach_sub_transaction(void* _cache, int32 id,
2989	transaction_notification_hook hook, void* data)
2990{
2991	block_cache* cache = (block_cache*)_cache;
2992	TransactionLocker locker(cache);
2993
2994	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
2995
2996	cache_transaction* transaction = lookup_transaction(cache, id);
2997	if (transaction == NULL) {
2998		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
2999		return B_BAD_VALUE;
3000	}
3001	if (!transaction->has_sub_transaction)
3002		return B_BAD_VALUE;
3003
3004	// iterate through all blocks and free the unchanged original contents
3005
3006	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3007	if (status != B_OK)
3008		return status;
3009
3010	// create a new transaction for the sub transaction
3011	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
3012	if (newTransaction == NULL)
3013		return B_NO_MEMORY;
3014
3015	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
3016	T(Detach(cache, transaction, newTransaction));
3017
3018	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3019
3020	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
3021			data) != B_OK) {
3022		delete newTransaction;
3023		return B_NO_MEMORY;
3024	}
3025
3026	cached_block* last = NULL;
3027	cached_block* next;
3028	for (cached_block* block = transaction->first_block; block != NULL;
3029			block = next) {
3030		next = block->transaction_next;
3031		ASSERT(block->previous_transaction == NULL);
3032
3033		if (block->discard) {
3034			cache->DiscardBlock(block);
3035			transaction->main_num_blocks--;
3036			continue;
3037		}
3038
3039		if (block->parent_data != NULL) {
3040			// The block changed in the parent - free the original data, since
3041			// they will be replaced by what is in current.
3042			ASSERT(block->original_data != NULL);
3043			cache->Free(block->original_data);
3044
3045			if (block->parent_data != block->current_data) {
3046				// The block had been changed in both transactions
3047				block->original_data = block->parent_data;
3048			} else {
3049				// The block has only been changed in the parent
3050				block->original_data = NULL;
3051			}
3052
3053			// move the block to the previous transaction list
3054			transaction->blocks.Add(block);
3055			block->previous_transaction = transaction;
3056			block->parent_data = NULL;
3057		}
3058
3059		if (block->original_data != NULL) {
3060			// This block had been changed in the current sub transaction,
3061			// we need to move this block over to the new transaction.
3062			ASSERT(block->parent_data == NULL);
3063
3064			if (last == NULL)
3065				newTransaction->first_block = block;
3066			else
3067				last->transaction_next = block;
3068
3069			block->transaction = newTransaction;
3070			last = block;
3071		} else
3072			block->transaction = NULL;
3073
3074		block->transaction_next = NULL;
3075	}
3076
3077	newTransaction->num_blocks = transaction->sub_num_blocks;
3078
3079	transaction->open = false;
3080	transaction->has_sub_transaction = false;
3081	transaction->num_blocks = transaction->main_num_blocks;
3082	transaction->sub_num_blocks = 0;
3083
3084	cache->transaction_hash->Insert(newTransaction);
3085	cache->last_transaction = newTransaction;
3086
3087	return newTransaction->id;
3088}
3089
3090
3091status_t
3092cache_abort_sub_transaction(void* _cache, int32 id)
3093{
3094	block_cache* cache = (block_cache*)_cache;
3095	TransactionLocker locker(cache);
3096
3097	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3098
3099	cache_transaction* transaction = lookup_transaction(cache, id);
3100	if (transaction == NULL) {
3101		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3102		return B_BAD_VALUE;
3103	}
3104	if (!transaction->has_sub_transaction)
3105		return B_BAD_VALUE;
3106
3107	T(Abort(cache, transaction));
3108	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3109
3110	// revert all changes back to the version of the parent
3111
3112	cached_block* block = transaction->first_block;
3113	cached_block* last = NULL;
3114	cached_block* next;
3115	for (; block != NULL; block = next) {
3116		next = block->transaction_next;
3117
3118		if (block->parent_data == NULL) {
3119			// The parent transaction didn't change the block, but the sub
3120			// transaction did - we need to revert to the original data.
3121			// The block is no longer part of the transaction
3122			if (block->original_data != NULL) {
3123				// The block might not have original data if was empty
3124				memcpy(block->current_data, block->original_data,
3125					cache->block_size);
3126			}
3127
3128			if (last != NULL)
3129				last->transaction_next = next;
3130			else
3131				transaction->first_block = next;
3132
3133			block->transaction_next = NULL;
3134			block->transaction = NULL;
3135			transaction->num_blocks--;
3136
3137			if (block->previous_transaction == NULL) {
3138				cache->Free(block->original_data);
3139				block->original_data = NULL;
3140				block->is_dirty = false;
3141
3142				if (block->ref_count == 0) {
3143					// Move the block into the unused list if possible
3144					block->unused = true;
3145					cache->unused_blocks.Add(block);
3146					cache->unused_block_count++;
3147				}
3148			}
3149		} else {
3150			if (block->parent_data != block->current_data) {
3151				// The block has been changed and must be restored - the block
3152				// is still dirty and part of the transaction
3153				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3154					"restored contents of block %" B_PRIdOFF "\n",
3155					transaction->id, block->block_number));
3156				memcpy(block->current_data, block->parent_data,
3157					cache->block_size);
3158				cache->Free(block->parent_data);
3159				// The block stays dirty
3160			}
3161			block->parent_data = NULL;
3162			last = block;
3163		}
3164
3165		block->discard = false;
3166	}
3167
3168	// all subsequent changes will go into the main transaction
3169	transaction->has_sub_transaction = false;
3170	transaction->sub_num_blocks = 0;
3171
3172	return B_OK;
3173}
3174
3175
3176status_t
3177cache_start_sub_transaction(void* _cache, int32 id)
3178{
3179	block_cache* cache = (block_cache*)_cache;
3180	TransactionLocker locker(cache);
3181
3182	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3183
3184	cache_transaction* transaction = lookup_transaction(cache, id);
3185	if (transaction == NULL) {
3186		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3187			id);
3188		return B_BAD_VALUE;
3189	}
3190
3191	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3192
3193	// move all changed blocks up to the parent
3194
3195	cached_block* block = transaction->first_block;
3196	cached_block* next;
3197	for (; block != NULL; block = next) {
3198		next = block->transaction_next;
3199
3200		if (block->parent_data != NULL) {
3201			// There already is an older sub transaction - we acknowledge
3202			// its changes and move its blocks up to the parent
3203			ASSERT(transaction->has_sub_transaction);
3204			cache->FreeBlockParentData(block);
3205		}
3206		if (block->discard) {
3207			// This block has been discarded in the parent transaction.
3208			// Just throw away any changes made in this transaction, so that
3209			// it can still be reverted to its original contents if needed
3210			ASSERT(block->previous_transaction == NULL);
3211			if (block->original_data != NULL) {
3212				memcpy(block->current_data, block->original_data,
3213					cache->block_size);
3214				block->original_data = NULL;
3215			}
3216			continue;
3217		}
3218
3219		// we "allocate" the parent data lazily, that means, we don't copy
3220		// the data (and allocate memory for it) until we need to
3221		block->parent_data = block->current_data;
3222	}
3223
3224	// all subsequent changes will go into the sub transaction
3225	transaction->has_sub_transaction = true;
3226	transaction->main_num_blocks = transaction->num_blocks;
3227	transaction->sub_num_blocks = 0;
3228	T(Action("start-sub", cache, transaction));
3229
3230	return B_OK;
3231}
3232
3233
3234/*!	Adds a transaction listener that gets notified when the transaction
3235	is ended, aborted, written, or idle as specified by \a events.
3236	The listener gets automatically removed when the transaction ends.
3237*/
3238status_t
3239cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3240	transaction_notification_hook hook, void* data)
3241{
3242	block_cache* cache = (block_cache*)_cache;
3243	TransactionLocker locker(cache);
3244
3245	cache_transaction* transaction = lookup_transaction(cache, id);
3246	if (transaction == NULL)
3247		return B_BAD_VALUE;
3248
3249	return add_transaction_listener(cache, transaction, events, hook, data);
3250}
3251
3252
3253status_t
3254cache_remove_transaction_listener(void* _cache, int32 id,
3255	transaction_notification_hook hookFunction, void* data)
3256{
3257	block_cache* cache = (block_cache*)_cache;
3258	TransactionLocker locker(cache);
3259
3260	cache_transaction* transaction = lookup_transaction(cache, id);
3261	if (transaction == NULL)
3262		return B_BAD_VALUE;
3263
3264	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3265	while (iterator.HasNext()) {
3266		cache_listener* listener = iterator.Next();
3267		if (listener->data == data && listener->hook == hookFunction) {
3268			iterator.Remove();
3269
3270			if (listener->events_pending != 0) {
3271				MutexLocker _(sNotificationsLock);
3272				if (listener->events_pending != 0)
3273					cache->pending_notifications.Remove(listener);
3274			}
3275			delete listener;
3276			return B_OK;
3277		}
3278	}
3279
3280	return B_ENTRY_NOT_FOUND;
3281}
3282
3283
3284status_t
3285cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3286	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3287{
3288	cached_block* block = (cached_block*)*_cookie;
3289	block_cache* cache = (block_cache*)_cache;
3290	TransactionLocker locker(cache);
3291
3292	cache_transaction* transaction = lookup_transaction(cache, id);
3293	if (transaction == NULL || !transaction->open)
3294		return B_BAD_VALUE;
3295
3296	if (block == NULL)
3297		block = transaction->first_block;
3298	else
3299		block = block->transaction_next;
3300
3301	if (transaction->has_sub_transaction) {
3302		if (mainOnly) {
3303			// find next block that the parent changed
3304			while (block != NULL && block->parent_data == NULL)
3305				block = block->transaction_next;
3306		} else {
3307			// find next non-discarded block
3308			while (block != NULL && block->discard)
3309				block = block->transaction_next;
3310		}
3311	}
3312
3313	if (block == NULL)
3314		return B_ENTRY_NOT_FOUND;
3315
3316	if (_blockNumber)
3317		*_blockNumber = block->block_number;
3318	if (_data)
3319		*_data = mainOnly ? block->parent_data : block->current_data;
3320	if (_unchangedData)
3321		*_unchangedData = block->original_data;
3322
3323	*_cookie = (addr_t)block;
3324	return B_OK;
3325}
3326
3327
3328int32
3329cache_blocks_in_transaction(void* _cache, int32 id)
3330{
3331	block_cache* cache = (block_cache*)_cache;
3332	TransactionLocker locker(cache);
3333
3334	cache_transaction* transaction = lookup_transaction(cache, id);
3335	if (transaction == NULL)
3336		return B_BAD_VALUE;
3337
3338	return transaction->num_blocks;
3339}
3340
3341
3342/*!	Returns the number of blocks that are part of the main transaction. If this
3343	transaction does not have a sub transaction yet, this is the same value as
3344	cache_blocks_in_transaction() would return.
3345*/
3346int32
3347cache_blocks_in_main_transaction(void* _cache, int32 id)
3348{
3349	block_cache* cache = (block_cache*)_cache;
3350	TransactionLocker locker(cache);
3351
3352	cache_transaction* transaction = lookup_transaction(cache, id);
3353	if (transaction == NULL)
3354		return B_BAD_VALUE;
3355
3356	if (transaction->has_sub_transaction)
3357		return transaction->main_num_blocks;
3358
3359	return transaction->num_blocks;
3360}
3361
3362
3363int32
3364cache_blocks_in_sub_transaction(void* _cache, int32 id)
3365{
3366	block_cache* cache = (block_cache*)_cache;
3367	TransactionLocker locker(cache);
3368
3369	cache_transaction* transaction = lookup_transaction(cache, id);
3370	if (transaction == NULL)
3371		return B_BAD_VALUE;
3372
3373	return transaction->sub_num_blocks;
3374}
3375
3376
3377/*!	Check if block is in transaction
3378*/
3379bool
3380cache_has_block_in_transaction(void* _cache, int32 id, off_t blockNumber)
3381{
3382	block_cache* cache = (block_cache*)_cache;
3383	TransactionLocker locker(cache);
3384
3385	cached_block* block = cache->hash->Lookup(blockNumber);
3386
3387	return (block != NULL && block->transaction != NULL
3388		&& block->transaction->id == id);
3389}
3390
3391
3392//	#pragma mark - public block cache API
3393
3394
3395void
3396block_cache_delete(void* _cache, bool allowWrites)
3397{
3398	block_cache* cache = (block_cache*)_cache;
3399
3400	if (allowWrites)
3401		block_cache_sync(cache);
3402
3403	mutex_lock(&sCachesLock);
3404	sCaches.Remove(cache);
3405	mutex_unlock(&sCachesLock);
3406
3407	mutex_lock(&cache->lock);
3408
3409	// wait for all blocks to become unbusy
3410	wait_for_busy_reading_blocks(cache);
3411	wait_for_busy_writing_blocks(cache);
3412
3413	// free all blocks
3414
3415	cached_block* block = cache->hash->Clear(true);
3416	while (block != NULL) {
3417		cached_block* next = block->next;
3418		cache->FreeBlock(block);
3419		block = next;
3420	}
3421
3422	// free all transactions (they will all be aborted)
3423
3424	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3425	while (transaction != NULL) {
3426		cache_transaction* next = transaction->next;
3427		delete transaction;
3428		transaction = next;
3429	}
3430
3431	delete cache;
3432}
3433
3434
3435void*
3436block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3437{
3438	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3439		readOnly);
3440	if (cache == NULL)
3441		return NULL;
3442
3443	if (cache->Init() != B_OK) {
3444		delete cache;
3445		return NULL;
3446	}
3447
3448	MutexLocker _(sCachesLock);
3449	sCaches.Add(cache);
3450
3451	return cache;
3452}
3453
3454
3455status_t
3456block_cache_sync(void* _cache)
3457{
3458	block_cache* cache = (block_cache*)_cache;
3459
3460	// We will sync all dirty blocks to disk that have a completed
3461	// transaction or no transaction only
3462
3463	MutexLocker locker(&cache->lock);
3464
3465	BlockWriter writer(cache);
3466	BlockTable::Iterator iterator(cache->hash);
3467
3468	while (iterator.HasNext()) {
3469		cached_block* block = iterator.Next();
3470		if (block->CanBeWritten())
3471			writer.Add(block);
3472	}
3473
3474	status_t status = writer.Write();
3475
3476	locker.Unlock();
3477
3478	wait_for_notifications(cache);
3479		// make sure that all pending TRANSACTION_WRITTEN notifications
3480		// are handled after we return
3481	return status;
3482}
3483
3484
3485status_t
3486block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3487{
3488	block_cache* cache = (block_cache*)_cache;
3489
3490	// We will sync all dirty blocks to disk that have a completed
3491	// transaction or no transaction only
3492
3493	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3494		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3495			" (max %" B_PRIdOFF ")",
3496			blockNumber, cache->max_blocks - 1);
3497		return B_BAD_VALUE;
3498	}
3499
3500	MutexLocker locker(&cache->lock);
3501	BlockWriter writer(cache);
3502
3503	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3504		cached_block* block = cache->hash->Lookup(blockNumber);
3505		if (block == NULL)
3506			continue;
3507
3508		if (block->CanBeWritten())
3509			writer.Add(block);
3510	}
3511
3512	status_t status = writer.Write();
3513
3514	locker.Unlock();
3515
3516	wait_for_notifications(cache);
3517		// make sure that all pending TRANSACTION_WRITTEN notifications
3518		// are handled after we return
3519	return status;
3520}
3521
3522
3523/*!	Discards a block from the current transaction or from the cache.
3524	You have to call this function when you no longer use a block, ie. when it
3525	might be reclaimed by the file cache in order to make sure they won't
3526	interfere.
3527*/
3528void
3529block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3530{
3531	// TODO: this could be a nice place to issue the ATA trim command
3532	block_cache* cache = (block_cache*)_cache;
3533	TransactionLocker locker(cache);
3534
3535	BlockWriter writer(cache);
3536
3537	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3538		cached_block* block = cache->hash->Lookup(blockNumber);
3539		if (block != NULL && block->previous_transaction != NULL)
3540			writer.Add(block);
3541	}
3542
3543	writer.Write();
3544		// TODO: this can fail, too!
3545
3546	blockNumber -= numBlocks;
3547		// reset blockNumber to its original value
3548
3549	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3550		cached_block* block = cache->hash->Lookup(blockNumber);
3551		if (block == NULL)
3552			continue;
3553
3554		ASSERT(block->previous_transaction == NULL);
3555
3556		if (block->unused) {
3557			cache->unused_blocks.Remove(block);
3558			cache->unused_block_count--;
3559			cache->RemoveBlock(block);
3560		} else {
3561			if (block->transaction != NULL && block->parent_data != NULL
3562				&& block->parent_data != block->current_data) {
3563				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3564					"transaction!", blockNumber);
3565			}
3566
3567			// mark it as discarded (in the current transaction only, if any)
3568			block->discard = true;
3569		}
3570	}
3571}
3572
3573
3574status_t
3575block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3576{
3577	block_cache* cache = (block_cache*)_cache;
3578	MutexLocker locker(&cache->lock);
3579
3580	if (cache->read_only) {
3581		panic("tried to make block writable on a read-only cache!");
3582		return B_ERROR;
3583	}
3584
3585	// TODO: this can be done better!
3586	void* block = get_writable_cached_block(cache, blockNumber,
3587		blockNumber, 1, transaction, false);
3588	if (block != NULL) {
3589		put_cached_block((block_cache*)_cache, blockNumber);
3590		return B_OK;
3591	}
3592
3593	return B_ERROR;
3594}
3595
3596
3597void*
3598block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3599	off_t length, int32 transaction)
3600{
3601	block_cache* cache = (block_cache*)_cache;
3602	MutexLocker locker(&cache->lock);
3603
3604	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3605		blockNumber, transaction));
3606	if (cache->read_only)
3607		panic("tried to get writable block on a read-only cache!");
3608
3609	return get_writable_cached_block(cache, blockNumber, base, length,
3610		transaction, false);
3611}
3612
3613
3614void*
3615block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3616{
3617	return block_cache_get_writable_etc(_cache, blockNumber,
3618		blockNumber, 1, transaction);
3619}
3620
3621
3622void*
3623block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3624{
3625	block_cache* cache = (block_cache*)_cache;
3626	MutexLocker locker(&cache->lock);
3627
3628	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3629		blockNumber, transaction));
3630	if (cache->read_only)
3631		panic("tried to get empty writable block on a read-only cache!");
3632
3633	return get_writable_cached_block((block_cache*)_cache, blockNumber,
3634		blockNumber, 1, transaction, true);
3635}
3636
3637
3638const void*
3639block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length)
3640{
3641	block_cache* cache = (block_cache*)_cache;
3642	MutexLocker locker(&cache->lock);
3643	bool allocated;
3644
3645	cached_block* block = get_cached_block(cache, blockNumber, &allocated);
3646	if (block == NULL)
3647		return NULL;
3648
3649#if BLOCK_CACHE_DEBUG_CHANGED
3650	if (block->compare == NULL)
3651		block->compare = cache->Allocate();
3652	if (block->compare != NULL)
3653		memcpy(block->compare, block->current_data, cache->block_size);
3654#endif
3655	TB(Get(cache, block));
3656
3657	return block->current_data;
3658}
3659
3660
3661const void*
3662block_cache_get(void* _cache, off_t blockNumber)
3663{
3664	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
3665}
3666
3667
3668/*!	Changes the internal status of a writable block to \a dirty. This can be
3669	helpful in case you realize you don't need to change that block anymore
3670	for whatever reason.
3671
3672	Note, you must only use this function on blocks that were acquired
3673	writable!
3674*/
3675status_t
3676block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3677	int32 transaction)
3678{
3679	block_cache* cache = (block_cache*)_cache;
3680	MutexLocker locker(&cache->lock);
3681
3682	cached_block* block = cache->hash->Lookup(blockNumber);
3683	if (block == NULL)
3684		return B_BAD_VALUE;
3685	if (block->is_dirty == dirty) {
3686		// there is nothing to do for us
3687		return B_OK;
3688	}
3689
3690	// TODO: not yet implemented
3691	if (dirty)
3692		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3693
3694	return B_OK;
3695}
3696
3697
3698void
3699block_cache_put(void* _cache, off_t blockNumber)
3700{
3701	block_cache* cache = (block_cache*)_cache;
3702	MutexLocker locker(&cache->lock);
3703
3704	put_cached_block(cache, blockNumber);
3705}
3706
3707