1/*
2 * Copyright 2001-2017, Axel D��rfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7//! Transaction and logging
8
9
10#include "Journal.h"
11
12#include "Debug.h"
13#include "Inode.h"
14
15
16struct run_array {
17	int32		count;
18	int32		max_runs;
19	block_run	runs[0];
20
21	void Init(int32 blockSize);
22	void Insert(block_run& run);
23
24	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
25	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
26		// that -1 accounts for an off-by-one error in Be's BFS implementation
27	const block_run& RunAt(int32 i) const { return runs[i]; }
28
29	static int32 MaxRuns(int32 blockSize);
30
31private:
32	static int _Compare(block_run& a, block_run& b);
33	int32 _FindInsertionIndex(block_run& run);
34};
35
36class RunArrays {
37public:
38							RunArrays(Journal* journal);
39							~RunArrays();
40
41			status_t		Insert(off_t blockNumber);
42
43			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
44			int32			CountArrays() const { return fArrays.CountItems(); }
45
46			uint32			CountBlocks() const { return fBlockCount; }
47			uint32			LogEntryLength() const
48								{ return CountBlocks() + CountArrays(); }
49
50			int32			MaxArrayLength();
51
52private:
53			status_t		_AddArray();
54			bool			_ContainsRun(block_run& run);
55			bool			_AddRun(block_run& run);
56
57			Journal*		fJournal;
58			uint32			fBlockCount;
59			Stack<run_array*> fArrays;
60			run_array*		fLastArray;
61};
62
63class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
64public:
65							LogEntry(Journal* journal, uint32 logStart,
66								uint32 length);
67							~LogEntry();
68
69			uint32			Start() const { return fStart; }
70			uint32			Length() const { return fLength; }
71
72#ifdef BFS_DEBUGGER_COMMANDS
73			void			SetTransactionID(int32 id) { fTransactionID = id; }
74			int32			TransactionID() const { return fTransactionID; }
75#endif
76
77			Journal*		GetJournal() { return fJournal; }
78
79private:
80			Journal*		fJournal;
81			uint32			fStart;
82			uint32			fLength;
83#ifdef BFS_DEBUGGER_COMMANDS
84			int32			fTransactionID;
85#endif
86};
87
88
89#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
90namespace BFSJournalTracing {
91
92class LogEntry : public AbstractTraceEntry {
93public:
94	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
95		:
96		fEntry(entry),
97#ifdef BFS_DEBUGGER_COMMANDS
98		fTransactionID(entry->TransactionID()),
99#endif
100		fStart(entry->Start()),
101		fLength(entry->Length()),
102		fLogPosition(logPosition),
103		fStarted(started)
104	{
105		Initialized();
106	}
107
108	virtual void AddDump(TraceOutput& out)
109	{
110#ifdef BFS_DEBUGGER_COMMANDS
111		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
112			"%lu\n", fStarted ? "Started" : "Written", fEntry,
113			fTransactionID, fStart, fLength,
114			fStarted ? "end" : "start", fLogPosition);
115#else
116		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
117			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
118			fStarted ? "end" : "start", fLogPosition);
119#endif
120	}
121
122private:
123	::LogEntry*	fEntry;
124#ifdef BFS_DEBUGGER_COMMANDS
125	int32		fTransactionID;
126#endif
127	uint32		fStart;
128	uint32		fLength;
129	uint32		fLogPosition;
130	bool		fStarted;
131};
132
133}	// namespace BFSJournalTracing
134
135#	define T(x) new(std::nothrow) BFSJournalTracing::x;
136#else
137#	define T(x) ;
138#endif
139
140
141//	#pragma mark -
142
143
144static void
145add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
146	size_t size)
147{
148	if (index > 0 && (addr_t)vecs[index - 1].iov_base
149			+ vecs[index - 1].iov_len == (addr_t)address) {
150		// the iovec can be combined with the previous one
151		vecs[index - 1].iov_len += size;
152		return;
153	}
154
155	if (index == max)
156		panic("no more space for iovecs!");
157
158	// we need to start a new iovec
159	vecs[index].iov_base = const_cast<void*>(address);
160	vecs[index].iov_len = size;
161	index++;
162}
163
164
165//	#pragma mark - LogEntry
166
167
168LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
169	:
170	fJournal(journal),
171	fStart(start),
172	fLength(length)
173{
174}
175
176
177LogEntry::~LogEntry()
178{
179}
180
181
182//	#pragma mark - run_array
183
184
185/*!	The run_array's size equals the block size of the BFS volume, so we
186	cannot use a (non-overridden) new.
187	This makes a freshly allocated run_array ready to run.
188*/
189void
190run_array::Init(int32 blockSize)
191{
192	memset(this, 0, blockSize);
193	count = 0;
194	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
195}
196
197
198/*!	Inserts the block_run into the array. You will have to make sure the
199	array is large enough to contain the entry before calling this function.
200*/
201void
202run_array::Insert(block_run& run)
203{
204	int32 index = _FindInsertionIndex(run);
205	if (index == -1) {
206		// add to the end
207		runs[CountRuns()] = run;
208	} else {
209		// insert at index
210		memmove(&runs[index + 1], &runs[index],
211			(CountRuns() - index) * sizeof(off_t));
212		runs[index] = run;
213	}
214
215	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
216}
217
218
219/*static*/ int32
220run_array::MaxRuns(int32 blockSize)
221{
222	// For whatever reason, BFS restricts the maximum array size
223	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
224	if (maxCount < 128)
225		return maxCount;
226
227	return 127;
228}
229
230
231/*static*/ int
232run_array::_Compare(block_run& a, block_run& b)
233{
234	int cmp = a.AllocationGroup() - b.AllocationGroup();
235	if (cmp == 0)
236		return a.Start() - b.Start();
237
238	return cmp;
239}
240
241
242int32
243run_array::_FindInsertionIndex(block_run& run)
244{
245	int32 min = 0, max = CountRuns() - 1;
246	int32 i = 0;
247	if (max >= 8) {
248		while (min <= max) {
249			i = (min + max) / 2;
250
251			int cmp = _Compare(runs[i], run);
252			if (cmp < 0)
253				min = i + 1;
254			else if (cmp > 0)
255				max = i - 1;
256			else
257				return -1;
258		}
259
260		if (_Compare(runs[i], run) < 0)
261			i++;
262	} else {
263		for (; i <= max; i++) {
264			if (_Compare(runs[i], run) > 0)
265				break;
266		}
267		if (i == count)
268			return -1;
269	}
270
271	return i;
272}
273
274
275//	#pragma mark - RunArrays
276
277
278RunArrays::RunArrays(Journal* journal)
279	:
280	fJournal(journal),
281	fBlockCount(0),
282	fArrays(),
283	fLastArray(NULL)
284{
285}
286
287
288RunArrays::~RunArrays()
289{
290	run_array* array;
291	while (fArrays.Pop(&array))
292		free(array);
293}
294
295
296bool
297RunArrays::_ContainsRun(block_run& run)
298{
299	for (int32 i = 0; i < CountArrays(); i++) {
300		run_array* array = ArrayAt(i);
301
302		for (int32 j = 0; j < array->CountRuns(); j++) {
303			block_run& arrayRun = array->runs[j];
304			if (run.AllocationGroup() != arrayRun.AllocationGroup())
305				continue;
306
307			if (run.Start() >= arrayRun.Start()
308				&& run.Start() + run.Length()
309					<= arrayRun.Start() + arrayRun.Length())
310				return true;
311		}
312	}
313
314	return false;
315}
316
317
318/*!	Adds the specified block_run into the array.
319	Note: it doesn't support overlapping - it must only be used
320	with block_runs of length 1!
321*/
322bool
323RunArrays::_AddRun(block_run& run)
324{
325	ASSERT(run.length == 1);
326
327	// Be's BFS log replay routine can only deal with block_runs of size 1
328	// A pity, isn't it? Too sad we have to be compatible.
329
330	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
331		return false;
332
333	fLastArray->Insert(run);
334	fBlockCount++;
335	return true;
336}
337
338
339status_t
340RunArrays::_AddArray()
341{
342	int32 blockSize = fJournal->GetVolume()->BlockSize();
343
344	run_array* array = (run_array*)malloc(blockSize);
345	if (array == NULL)
346		return B_NO_MEMORY;
347
348	if (fArrays.Push(array) != B_OK) {
349		free(array);
350		return B_NO_MEMORY;
351	}
352
353	array->Init(blockSize);
354	fLastArray = array;
355	return B_OK;
356}
357
358
359status_t
360RunArrays::Insert(off_t blockNumber)
361{
362	Volume* volume = fJournal->GetVolume();
363	block_run run = volume->ToBlockRun(blockNumber);
364
365	if (fLastArray != NULL) {
366		// check if the block is already in the array
367		if (_ContainsRun(run))
368			return B_OK;
369	}
370
371	// insert block into array
372
373	if (!_AddRun(run)) {
374		// array is full
375		if (_AddArray() != B_OK || !_AddRun(run))
376			return B_NO_MEMORY;
377	}
378
379	return B_OK;
380}
381
382
383int32
384RunArrays::MaxArrayLength()
385{
386	int32 max = 0;
387	for (int32 i = 0; i < CountArrays(); i++) {
388		if (ArrayAt(i)->CountRuns() > max)
389			max = ArrayAt(i)->CountRuns();
390	}
391
392	return max;
393}
394
395
396//	#pragma mark - Journal
397
398
399Journal::Journal(Volume* volume)
400	:
401	fVolume(volume),
402	fOwner(NULL),
403	fLogSize(volume->Log().Length()),
404	fMaxTransactionSize(fLogSize / 2 - 5),
405	fUsed(0),
406	fUnwrittenTransactions(0),
407	fHasSubtransaction(false),
408	fSeparateSubTransactions(false)
409{
410	recursive_lock_init(&fLock, "bfs journal");
411	mutex_init(&fEntriesLock, "bfs journal entries");
412}
413
414
415Journal::~Journal()
416{
417	FlushLogAndBlocks();
418
419	recursive_lock_destroy(&fLock);
420	mutex_destroy(&fEntriesLock);
421}
422
423
424status_t
425Journal::InitCheck()
426{
427	return B_OK;
428}
429
430
431/*!	\brief Does a very basic consistency check of the run array.
432	It will check the maximum run count as well as if all of the runs fall
433	within a the volume.
434*/
435status_t
436Journal::_CheckRunArray(const run_array* array)
437{
438	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
439		// the -1 works around an off-by-one bug in Be's BFS implementation,
440		// same as in run_array::MaxRuns()
441	if (array->MaxRuns() != maxRuns
442		|| array->CountRuns() > maxRuns
443		|| array->CountRuns() <= 0) {
444		dprintf("run count: %d, array max: %d, max runs: %d\n",
445			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
446		FATAL(("Log entry has broken header!\n"));
447		return B_ERROR;
448	}
449
450	for (int32 i = 0; i < array->CountRuns(); i++) {
451		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
452			return B_ERROR;
453	}
454
455	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
456	return B_OK;
457}
458
459
460/*!	Replays an entry in the log.
461	\a _start points to the entry in the log, and will be bumped to the next
462	one if replaying succeeded.
463*/
464status_t
465Journal::_ReplayRunArray(int32* _start)
466{
467	PRINT(("ReplayRunArray(start = %ld)\n", *_start));
468
469	off_t logOffset = fVolume->ToBlock(fVolume->Log());
470	off_t firstBlockNumber = *_start % fLogSize;
471
472	CachedBlock cachedArray(fVolume);
473
474	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
475		+ firstBlockNumber);
476	if (array == NULL)
477		return B_IO_ERROR;
478
479	if (_CheckRunArray(array) < B_OK)
480		return B_BAD_DATA;
481
482	// First pass: check integrity of the blocks in the run array
483
484	CachedBlock cached(fVolume);
485
486	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
487	off_t blockNumber = firstBlockNumber;
488	int32 blockSize = fVolume->BlockSize();
489
490	for (int32 index = 0; index < array->CountRuns(); index++) {
491		const block_run& run = array->RunAt(index);
492
493		off_t offset = fVolume->ToOffset(run);
494		for (int32 i = 0; i < run.Length(); i++) {
495			const uint8* data = cached.SetTo(logOffset + blockNumber);
496			if (data == NULL)
497				RETURN_ERROR(B_IO_ERROR);
498
499			// TODO: eventually check other well known offsets, like the
500			// root and index dirs
501			if (offset == 0) {
502				// This log entry writes over the superblock - check if
503				// it's valid!
504				if (Volume::CheckSuperBlock(data) != B_OK) {
505					FATAL(("Log contains invalid superblock!\n"));
506					RETURN_ERROR(B_BAD_DATA);
507				}
508			}
509
510			blockNumber = (blockNumber + 1) % fLogSize;
511			offset += blockSize;
512		}
513	}
514
515	// Second pass: write back its blocks
516
517	blockNumber = firstBlockNumber;
518	int32 count = 1;
519
520	for (int32 index = 0; index < array->CountRuns(); index++) {
521		const block_run& run = array->RunAt(index);
522		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
523			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));
524
525		off_t offset = fVolume->ToOffset(run);
526		for (int32 i = 0; i < run.Length(); i++) {
527			const uint8* data = cached.SetTo(logOffset + blockNumber);
528			if (data == NULL)
529				RETURN_ERROR(B_IO_ERROR);
530
531			ssize_t written = write_pos(fVolume->Device(), offset, data,
532				blockSize);
533			if (written != blockSize)
534				RETURN_ERROR(B_IO_ERROR);
535
536			blockNumber = (blockNumber + 1) % fLogSize;
537			offset += blockSize;
538			count++;
539		}
540	}
541
542	*_start += count;
543	return B_OK;
544}
545
546
547/*!	Replays all log entries - this will put the disk into a
548	consistent and clean state, if it was not correctly unmounted
549	before.
550	This method is called by Journal::InitCheck() if the log start
551	and end pointer don't match.
552*/
553status_t
554Journal::ReplayLog()
555{
556	// TODO: this logic won't work whenever the size of the pending transaction
557	//	equals the size of the log (happens with the original BFS only)
558	if (fVolume->LogStart() == fVolume->LogEnd())
559		return B_OK;
560
561	INFORM(("Replay log, disk was not correctly unmounted...\n"));
562
563	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
564		INFORM(("log_start and log_end differ, but disk is marked clean - "
565			"trying to replay log...\n"));
566	}
567
568	if (fVolume->IsReadOnly())
569		return B_READ_ONLY_DEVICE;
570
571	int32 start = fVolume->LogStart();
572	int32 lastStart = -1;
573	while (true) {
574		// stop if the log is completely flushed
575		if (start == fVolume->LogEnd())
576			break;
577
578		if (start == lastStart) {
579			// strange, flushing the log hasn't changed the log_start pointer
580			return B_ERROR;
581		}
582		lastStart = start;
583
584		status_t status = _ReplayRunArray(&start);
585		if (status != B_OK) {
586			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
587				strerror(status)));
588			return B_ERROR;
589		}
590		start = start % fLogSize;
591	}
592
593	PRINT(("replaying worked fine!\n"));
594	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
595		fVolume->LogEnd());
596	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
597	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
598		SUPER_BLOCK_DISK_CLEAN);
599
600	return fVolume->WriteSuperBlock();
601}
602
603
604size_t
605Journal::CurrentTransactionSize() const
606{
607	if (_HasSubTransaction()) {
608		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
609			fTransactionID);
610	}
611
612	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
613		fTransactionID);
614}
615
616
617bool
618Journal::CurrentTransactionTooLarge() const
619{
620	return CurrentTransactionSize() > fLogSize;
621}
622
623
624/*!	This is a callback function that is called by the cache, whenever
625	all blocks of a transaction have been flushed to disk.
626	This lets us keep track of completed transactions, and update
627	the log start pointer as needed. Note, the transactions may not be
628	completed in the order they were written.
629*/
630/*static*/ void
631Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
632{
633	LogEntry* logEntry = (LogEntry*)_logEntry;
634
635	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
636		transactionID));
637
638	Journal* journal = logEntry->GetJournal();
639	disk_super_block& superBlock = journal->fVolume->SuperBlock();
640	bool update = false;
641
642	// Set log_start pointer if possible...
643
644	mutex_lock(&journal->fEntriesLock);
645
646	if (logEntry == journal->fEntries.First()) {
647		LogEntry* next = journal->fEntries.GetNext(logEntry);
648		if (next != NULL) {
649			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
650				% journal->fLogSize);
651		} else {
652			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
653				journal->fVolume->LogEnd());
654		}
655
656		update = true;
657	}
658
659	T(LogEntry(logEntry, superBlock.LogStart(), false));
660
661	journal->fUsed -= logEntry->Length();
662	journal->fEntries.Remove(logEntry);
663	mutex_unlock(&journal->fEntriesLock);
664
665	delete logEntry;
666
667	// update the superblock, and change the disk's state, if necessary
668
669	if (update) {
670		if (superBlock.log_start == superBlock.log_end)
671			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);
672
673		status_t status = journal->fVolume->WriteSuperBlock();
674		if (status != B_OK) {
675			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
676				strerror(status)));
677		}
678
679		journal->fVolume->LogStart() = superBlock.LogStart();
680	}
681}
682
683
684/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
685/*static*/ void
686Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
687{
688	// The current transaction seems to be idle - flush it. We can't do this
689	// in this thread, as flushing the log can produce new transaction events.
690	thread_id id = spawn_kernel_thread(&Journal::_FlushLog, "bfs log flusher",
691		B_NORMAL_PRIORITY, _journal);
692	if (id > 0)
693		resume_thread(id);
694}
695
696
697/*static*/ status_t
698Journal::_FlushLog(void* _journal)
699{
700	Journal* journal = (Journal*)_journal;
701	return journal->_FlushLog(false, false);
702}
703
704
705/*!	Writes the blocks that are part of current transaction into the log,
706	and ends the current transaction.
707	If the current transaction is too large to fit into the log, it will
708	try to detach an existing sub-transaction.
709*/
710status_t
711Journal::_WriteTransactionToLog()
712{
713	// TODO: in case of a failure, we need a backup plan like writing all
714	//	changed blocks back to disk immediately (hello disk corruption!)
715
716	bool detached = false;
717
718	if (_TransactionSize() > fLogSize) {
719		// The current transaction won't fit into the log anymore, try to
720		// detach the current sub-transaction
721		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
722				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
723			detached = true;
724		} else {
725			// We created a transaction larger than one we can write back to
726			// disk - the only option we have (besides risking disk corruption
727			// by writing it back anyway), is to let it fail.
728			dprintf("transaction too large (%d blocks, log size %d)!\n",
729				(int)_TransactionSize(), (int)fLogSize);
730			return B_BUFFER_OVERFLOW;
731		}
732	}
733
734	fHasSubtransaction = false;
735
736	int32 blockShift = fVolume->BlockShift();
737	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
738	off_t logStart = fVolume->LogEnd() % fLogSize;
739	off_t logPosition = logStart;
740	status_t status;
741
742	// create run_array structures for all changed blocks
743
744	RunArrays runArrays(this);
745
746	off_t blockNumber;
747	long cookie = 0;
748	while (cache_next_block_in_transaction(fVolume->BlockCache(),
749			fTransactionID, detached, &cookie, &blockNumber, NULL,
750			NULL) == B_OK) {
751		status = runArrays.Insert(blockNumber);
752		if (status < B_OK) {
753			FATAL(("filling log entry failed!"));
754			return status;
755		}
756	}
757
758	if (runArrays.CountBlocks() == 0) {
759		// nothing has changed during this transaction
760		if (detached) {
761			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
762				fTransactionID, NULL, NULL);
763			fUnwrittenTransactions = 1;
764		} else {
765			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
766				NULL);
767			fUnwrittenTransactions = 0;
768		}
769		return B_OK;
770	}
771
772	// If necessary, flush the log, so that we have enough space for this
773	// transaction
774	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
775		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
776		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
777			panic("no space in log after sync (%ld for %ld blocks)!",
778				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
779		}
780	}
781
782	// Write log entries to disk
783
784	int32 maxVecs = runArrays.MaxArrayLength() + 1;
785		// one extra for the index block
786
787	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
788	if (vecs == NULL) {
789		// TODO: write back log entries directly?
790		return B_NO_MEMORY;
791	}
792
793	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
794		run_array* array = runArrays.ArrayAt(k);
795		int32 index = 0, count = 1;
796		int32 wrap = fLogSize - logStart;
797
798		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());
799
800		// add block runs
801
802		for (int32 i = 0; i < array->CountRuns(); i++) {
803			const block_run& run = array->RunAt(i);
804			off_t blockNumber = fVolume->ToBlock(run);
805
806			for (int32 j = 0; j < run.Length(); j++) {
807				if (count >= wrap) {
808					// We need to write back the first half of the entry
809					// directly as the log wraps around
810					if (writev_pos(fVolume->Device(), logOffset
811						+ (logStart << blockShift), vecs, index) < 0)
812						FATAL(("could not write log area!\n"));
813
814					logPosition = logStart + count;
815					logStart = 0;
816					wrap = fLogSize;
817					count = 0;
818					index = 0;
819				}
820
821				// make blocks available in the cache
822				const void* data = block_cache_get(fVolume->BlockCache(),
823					blockNumber + j);
824				if (data == NULL) {
825					free(vecs);
826					return B_IO_ERROR;
827				}
828
829				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
830				count++;
831			}
832		}
833
834		// write back the rest of the log entry
835		if (count > 0) {
836			logPosition = logStart + count;
837			if (writev_pos(fVolume->Device(), logOffset
838					+ (logStart << blockShift), vecs, index) < 0)
839				FATAL(("could not write log area: %s!\n", strerror(errno)));
840		}
841
842		// release blocks again
843		for (int32 i = 0; i < array->CountRuns(); i++) {
844			const block_run& run = array->RunAt(i);
845			off_t blockNumber = fVolume->ToBlock(run);
846
847			for (int32 j = 0; j < run.Length(); j++) {
848				block_cache_put(fVolume->BlockCache(), blockNumber + j);
849			}
850		}
851
852		logStart = logPosition % fLogSize;
853	}
854
855	free(vecs);
856
857	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
858		runArrays.LogEntryLength());
859	if (logEntry == NULL) {
860		FATAL(("no memory to allocate log entries!"));
861		return B_NO_MEMORY;
862	}
863
864#ifdef BFS_DEBUGGER_COMMANDS
865	logEntry->SetTransactionID(fTransactionID);
866#endif
867
868	// Update the log end pointer in the superblock
869
870	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
871	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);
872
873	status = fVolume->WriteSuperBlock();
874
875	fVolume->LogEnd() = logPosition;
876	T(LogEntry(logEntry, fVolume->LogEnd(), true));
877
878	// We need to flush the drives own cache here to ensure
879	// disk consistency.
880	// If that call fails, we can't do anything about it anyway
881	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);
882
883	// at this point, we can finally end the transaction - we're in
884	// a guaranteed valid state
885
886	mutex_lock(&fEntriesLock);
887	fEntries.Add(logEntry);
888	fUsed += logEntry->Length();
889	mutex_unlock(&fEntriesLock);
890
891	if (detached) {
892		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
893			fTransactionID, _TransactionWritten, logEntry);
894		fUnwrittenTransactions = 1;
895
896		if (status == B_OK && _TransactionSize() > fLogSize) {
897			// If the transaction is too large after writing, there is no way to
898			// recover, so let this transaction fail.
899			dprintf("transaction too large (%d blocks, log size %d)!\n",
900				(int)_TransactionSize(), (int)fLogSize);
901			return B_BUFFER_OVERFLOW;
902		}
903	} else {
904		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
905			_TransactionWritten, logEntry);
906		fUnwrittenTransactions = 0;
907	}
908
909	return status;
910}
911
912
913/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
914	also write back all dirty blocks for this volume.
915*/
916status_t
917Journal::_FlushLog(bool canWait, bool flushBlocks)
918{
919	status_t status = canWait ? recursive_lock_lock(&fLock)
920		: recursive_lock_trylock(&fLock);
921	if (status != B_OK)
922		return status;
923
924	if (recursive_lock_get_recursion(&fLock) > 1) {
925		// whoa, FlushLogAndBlocks() was called from inside a transaction
926		recursive_lock_unlock(&fLock);
927		return B_OK;
928	}
929
930	// write the current log entry to disk
931
932	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
933		status = _WriteTransactionToLog();
934		if (status < B_OK)
935			FATAL(("writing current log entry failed: %s\n", strerror(status)));
936	}
937
938	if (flushBlocks)
939		status = fVolume->FlushDevice();
940
941	recursive_lock_unlock(&fLock);
942	return status;
943}
944
945
946/*!	Flushes the current log entry to disk, and also writes back all dirty
947	blocks for this volume (completing all open transactions).
948*/
949status_t
950Journal::FlushLogAndBlocks()
951{
952	return _FlushLog(true, true);
953}
954
955
956status_t
957Journal::Lock(Transaction* owner, bool separateSubTransactions)
958{
959	status_t status = recursive_lock_lock(&fLock);
960	if (status != B_OK)
961		return status;
962
963	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
964		// we'll just use the current transaction again
965		return B_OK;
966	}
967
968	if (separateSubTransactions)
969		fSeparateSubTransactions = true;
970
971	if (owner != NULL)
972		owner->SetParent(fOwner);
973
974	fOwner = owner;
975
976	// TODO: we need a way to find out how big the current transaction is;
977	//	we need to be able to either detach the latest sub transaction on
978	//	demand, as well as having some kind of fall back plan in case the
979	//	sub transaction itself grows bigger than the log.
980	//	For that, it would be nice to have some call-back interface in the
981	//	cache transaction API...
982
983	if (fOwner != NULL) {
984		if (fUnwrittenTransactions > 0) {
985			// start a sub transaction
986			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
987			fHasSubtransaction = true;
988		} else
989			fTransactionID = cache_start_transaction(fVolume->BlockCache());
990
991		if (fTransactionID < B_OK) {
992			recursive_lock_unlock(&fLock);
993			return fTransactionID;
994		}
995
996		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
997			TRANSACTION_IDLE, _TransactionIdle, this);
998	}
999	return B_OK;
1000}
1001
1002
1003status_t
1004Journal::Unlock(Transaction* owner, bool success)
1005{
1006	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
1007		// we only end the transaction if we would really unlock it
1008		// TODO: what about failing transactions that do not unlock?
1009		// (they must make the parent fail, too)
1010		if (owner != NULL) {
1011			status_t status = _TransactionDone(success);
1012			if (status != B_OK)
1013				return status;
1014
1015			// Unlocking the inodes might trigger new transactions, but we
1016			// cannot reuse the current one anymore, as this one is already
1017			// closed.
1018			bool separateSubTransactions = fSeparateSubTransactions;
1019			fSeparateSubTransactions = true;
1020			owner->NotifyListeners(success);
1021			fSeparateSubTransactions = separateSubTransactions;
1022
1023			fOwner = owner->Parent();
1024		} else
1025			fOwner = NULL;
1026
1027		fTimestamp = system_time();
1028
1029		if (fSeparateSubTransactions
1030			&& recursive_lock_get_recursion(&fLock) == 1)
1031			fSeparateSubTransactions = false;
1032	} else
1033		owner->MoveListenersTo(fOwner);
1034
1035	recursive_lock_unlock(&fLock);
1036	return B_OK;
1037}
1038
1039
1040uint32
1041Journal::_TransactionSize() const
1042{
1043	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1044		fTransactionID);
1045	if (count <= 0)
1046		return 0;
1047
1048	// take the number of array blocks in this transaction into account
1049	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1050	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1051	return count + arrayBlocks;
1052}
1053
1054
1055status_t
1056Journal::_TransactionDone(bool success)
1057{
1058	if (!success) {
1059		if (_HasSubTransaction()) {
1060			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
1061			// We can continue to use the parent transaction afterwards
1062		} else {
1063			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
1064			fUnwrittenTransactions = 0;
1065		}
1066
1067		return B_OK;
1068	}
1069
1070	// Up to a maximum size, we will just batch several
1071	// transactions together to improve speed
1072	uint32 size = _TransactionSize();
1073	if (size < fMaxTransactionSize) {
1074		// Flush the log from time to time, so that we have enough space
1075		// for this transaction
1076		if (size > FreeLogBlocks())
1077			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
1078
1079		fUnwrittenTransactions++;
1080		return B_OK;
1081	}
1082
1083	return _WriteTransactionToLog();
1084}
1085
1086
1087//	#pragma mark - debugger commands
1088
1089
1090#ifdef BFS_DEBUGGER_COMMANDS
1091
1092
1093void
1094Journal::Dump()
1095{
1096	kprintf("Journal %p\n", this);
1097	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
1098	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
1099	kprintf("  owner:                %p\n", fOwner);
1100	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
1101	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
1102	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
1103	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
1104	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
1105	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
1106	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
1107	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
1108	kprintf("entries:\n");
1109	kprintf("  address        id  start length\n");
1110
1111	LogEntryList::Iterator iterator = fEntries.GetIterator();
1112
1113	while (iterator.HasNext()) {
1114		LogEntry* entry = iterator.Next();
1115
1116		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
1117			entry->TransactionID(), entry->Start(), entry->Length());
1118	}
1119}
1120
1121
1122int
1123dump_journal(int argc, char** argv)
1124{
1125	if (argc != 2 || !strcmp(argv[1], "--help")) {
1126		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1127		return 0;
1128	}
1129
1130	Volume* volume = (Volume*)parse_expression(argv[1]);
1131	Journal* journal = volume->GetJournal(0);
1132
1133	journal->Dump();
1134	return 0;
1135}
1136
1137
1138#endif	// BFS_DEBUGGER_COMMANDS
1139
1140
1141//	#pragma mark - TransactionListener
1142
1143
1144TransactionListener::TransactionListener()
1145{
1146}
1147
1148
1149TransactionListener::~TransactionListener()
1150{
1151}
1152
1153
1154//	#pragma mark - Transaction
1155
1156
1157status_t
1158Transaction::Start(Volume* volume, off_t refBlock)
1159{
1160	// has it already been started?
1161	if (fJournal != NULL)
1162		return B_OK;
1163
1164	fJournal = volume->GetJournal(refBlock);
1165	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1166		return B_OK;
1167
1168	fJournal = NULL;
1169	return B_ERROR;
1170}
1171
1172
1173void
1174Transaction::AddListener(TransactionListener* listener)
1175{
1176	if (fJournal == NULL)
1177		panic("Transaction is not running!");
1178
1179	fListeners.Add(listener);
1180}
1181
1182
1183void
1184Transaction::RemoveListener(TransactionListener* listener)
1185{
1186	if (fJournal == NULL)
1187		panic("Transaction is not running!");
1188
1189	fListeners.Remove(listener);
1190	listener->RemovedFromTransaction();
1191}
1192
1193
1194void
1195Transaction::NotifyListeners(bool success)
1196{
1197	while (TransactionListener* listener = fListeners.RemoveHead()) {
1198		listener->TransactionDone(success);
1199		listener->RemovedFromTransaction();
1200	}
1201}
1202
1203
1204/*!	Move the inodes into the parent transaction. This is needed only to make
1205	sure they will still be reverted in case the transaction is aborted.
1206*/
1207void
1208Transaction::MoveListenersTo(Transaction* transaction)
1209{
1210	while (TransactionListener* listener = fListeners.RemoveHead()) {
1211		transaction->fListeners.Add(listener);
1212	}
1213}
1214