1/*
 * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7//! Transaction and logging
8
9
10#include "Journal.h"
11
12#include "Debug.h"
13#include "Inode.h"
14
15
// On-disk header of a single log entry: a sorted array of block_runs listing
// the blocks the entry contains. The structure occupies exactly one volume
// block; `runs` is a flexible array member filling the rest of that block.
// Both fields are stored in BFS (disk) endianness.
struct run_array {
	int32		count;
	int32		max_runs;
	block_run	runs[0];

	// Zeroes the whole block and computes max_runs for this block size.
	void Init(int32 blockSize);
	// Inserts \a run sorted; the caller must ensure there is room.
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	// Number of block_runs that fit into one block of the given size
	// (capped, see implementation).
	static int32 MaxRuns(int32 blockSize);

private:
	// Sort order: by allocation group first, then by start offset.
	static int _Compare(block_run& a, block_run& b);
	// Returns the sorted insertion position, or -1 (see implementation).
	int32 _FindInsertionIndex(block_run& run);
};
35
// Collects the blocks of one transaction into as many run_array blocks as
// needed. Owns the malloc'd arrays and frees them in the destructor.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			// Adds the block unless it is already contained in an array.
			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			// Number of data blocks added so far (run_array blocks excluded).
			uint32			CountBlocks() const { return fBlockCount; }
			// Total log blocks needed: data blocks plus one header block per
			// run_array.
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			// Largest run count of any single array (see implementation).
			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;
			Stack<run_array*> fArrays;
			run_array*		fLastArray;
				// most recently added array; new runs go here
};
62
// In-memory record of one log entry that has been written to the log area.
// Kept in Journal::fEntries until the cache reports the transaction's blocks
// as flushed (see Journal::_TransactionWritten()).
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			// Start block of this entry within the circular log area.
			uint32			Start() const { return fStart; }
			// Entry length in blocks (run_array blocks included).
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;
			uint32			fLength;
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
87
88
89#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
90namespace BFSJournalTracing {
91
// Trace entry that records when a journal log entry is started or has been
// completely written; only compiled in when BFS tracing is enabled.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
		// NOTE(review): stored as a raw pointer for identification only;
		// the entry may already be deleted when the trace is dumped
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};
132
133}	// namespace BFSJournalTracing
134
135#	define T(x) new(std::nothrow) BFSJournalTracing::x;
136#else
137#	define T(x) ;
138#endif
139
140
141//	#pragma mark -
142
143
144static void
145add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
146	size_t size)
147{
148	if (index > 0 && (addr_t)vecs[index - 1].iov_base
149			+ vecs[index - 1].iov_len == (addr_t)address) {
150		// the iovec can be combined with the previous one
151		vecs[index - 1].iov_len += size;
152		return;
153	}
154
155	if (index == max)
156		panic("no more space for iovecs!");
157
158	// we need to start a new iovec
159	vecs[index].iov_base = const_cast<void*>(address);
160	vecs[index].iov_len = size;
161	index++;
162}
163
164
165//	#pragma mark - LogEntry
166
167
/*!	Creates a record for a log entry starting at log block \a start and
	spanning \a length blocks.
*/
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
175
176
// Nothing to release; removing the entry from the journal's list is done by
// the caller (see Journal::_TransactionWritten()).
LogEntry::~LogEntry()
{
}
180
181
182//	#pragma mark - run_array
183
184
/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	// zero the whole block, including the (still empty) runs area
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
196
197
198/*!	Inserts the block_run into the array. You will have to make sure the
199	array is large enough to contain the entry before calling this function.
200*/
201void
202run_array::Insert(block_run& run)
203{
204	int32 index = _FindInsertionIndex(run);
205	if (index == -1) {
206		// add to the end
207		runs[CountRuns()] = run;
208	} else {
209		// insert at index
210		memmove(&runs[index + 1], &runs[index],
211			(CountRuns() - index) * sizeof(off_t));
212		runs[index] = run;
213	}
214
215	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
216}
217
218
219/*static*/ int32
220run_array::MaxRuns(int32 blockSize)
221{
222	// For whatever reason, BFS restricts the maximum array size
223	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
224	if (maxCount < 128)
225		return maxCount;
226
227	return 127;
228}
229
230
231/*static*/ int
232run_array::_Compare(block_run& a, block_run& b)
233{
234	int cmp = a.AllocationGroup() - b.AllocationGroup();
235	if (cmp == 0)
236		return a.Start() - b.Start();
237
238	return cmp;
239}
240
241
242int32
243run_array::_FindInsertionIndex(block_run& run)
244{
245	int32 min = 0, max = CountRuns() - 1;
246	int32 i = 0;
247	if (max >= 8) {
248		while (min <= max) {
249			i = (min + max) / 2;
250
251			int cmp = _Compare(runs[i], run);
252			if (cmp < 0)
253				min = i + 1;
254			else if (cmp > 0)
255				max = i - 1;
256			else
257				return -1;
258		}
259
260		if (_Compare(runs[i], run) < 0)
261			i++;
262	} else {
263		for (; i <= max; i++) {
264			if (_Compare(runs[i], run) > 0)
265				break;
266		}
267		if (i == count)
268			return -1;
269	}
270
271	return i;
272}
273
274
275//	#pragma mark - RunArrays
276
277
/*!	Creates an empty collection of run arrays for the given journal. */
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
286
287
/*!	Frees all run_array blocks collected by this object. */
RunArrays::~RunArrays()
{
	run_array* array;
	while (fArrays.Pop(&array))
		free(array);
}
294
295
296bool
297RunArrays::_ContainsRun(block_run& run)
298{
299	for (int32 i = 0; i < CountArrays(); i++) {
300		run_array* array = ArrayAt(i);
301
302		for (int32 j = 0; j < array->CountRuns(); j++) {
303			block_run& arrayRun = array->runs[j];
304			if (run.AllocationGroup() != arrayRun.AllocationGroup())
305				continue;
306
307			if (run.Start() >= arrayRun.Start()
308				&& run.Start() + run.Length()
309					<= arrayRun.Start() + arrayRun.Length())
310				return true;
311		}
312	}
313
314	return false;
315}
316
317
318/*!	Adds the specified block_run into the array.
319	Note: it doesn't support overlapping - it must only be used
320	with block_runs of length 1!
321*/
322bool
323RunArrays::_AddRun(block_run& run)
324{
325	ASSERT(run.length == 1);
326
327	// Be's BFS log replay routine can only deal with block_runs of size 1
328	// A pity, isn't it? Too sad we have to be compatible.
329
330	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
331		return false;
332
333	fLastArray->Insert(run);
334	fBlockCount++;
335	return true;
336}
337
338
339status_t
340RunArrays::_AddArray()
341{
342	int32 blockSize = fJournal->GetVolume()->BlockSize();
343
344	run_array* array = (run_array*)malloc(blockSize);
345	if (array == NULL)
346		return B_NO_MEMORY;
347
348	if (fArrays.Push(array) != B_OK) {
349		free(array);
350		return B_NO_MEMORY;
351	}
352
353	array->Init(blockSize);
354	fLastArray = array;
355	return B_OK;
356}
357
358
/*!	Adds the given block to the run arrays, unless it is already contained;
	starts a new run_array when the current one is full.
*/
status_t
RunArrays::Insert(off_t blockNumber)
{
	Volume* volume = fJournal->GetVolume();
	block_run run = volume->ToBlockRun(blockNumber);

	if (fLastArray != NULL) {
		// check if the block is already in the array
		if (_ContainsRun(run))
			return B_OK;
	}

	// insert block into array

	if (!_AddRun(run)) {
		// array is full
		if (_AddArray() != B_OK || !_AddRun(run))
			return B_NO_MEMORY;
	}

	return B_OK;
}
381
382
383int32
384RunArrays::MaxArrayLength()
385{
386	int32 max = 0;
387	for (int32 i = 0; i < CountArrays(); i++) {
388		if (ArrayAt(i)->CountRuns() > max)
389			max = ArrayAt(i)->CountRuns();
390	}
391
392	return max;
393}
394
395
396//	#pragma mark - Journal
397
398
/*!	Sets up the journal for the given volume, including the log flusher
	thread that performs asynchronous log flushes.
*/
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// limit batched transactions to roughly half the log size; the
		// extra 5 blocks of slack presumably cover run_array overhead -
		// TODO confirm
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");

	// the flusher thread waits on this semaphore, which is released by
	// _TransactionIdle() whenever the current transaction becomes idle
	fLogFlusherSem = create_sem(0, "bfs log flusher");
	fLogFlusher = spawn_kernel_thread(&Journal::_LogFlusher, "bfs log flusher",
		B_NORMAL_PRIORITY, this);
	if (fLogFlusher > 0)
		resume_thread(fLogFlusher);
}
419
420
/*!	Flushes any pending log entries and blocks, and shuts down the log
	flusher thread.
*/
Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);

	// Invalidating fLogFlusherSem ends the flusher's loop; deleting the
	// semaphore makes a blocked acquire_sem() fail, so the thread wakes up
	// and exits - then we wait for it to be gone.
	sem_id logFlusher = fLogFlusherSem;
	fLogFlusherSem = -1;
	delete_sem(logFlusher);
	wait_for_thread(fLogFlusher, NULL);
}
433
434
/*!	Returns the initialization state of the journal; currently there is
	nothing that could fail, so it always reports success.
*/
status_t
Journal::InitCheck()
{
	return B_OK;
}
440
441
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// every run has to reference valid blocks on the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}
469
470
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is its run_array header
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the superblock - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid superblock!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 so that the run_array block itself is counted, too

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the block from the log area to its final location
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}
556
557
/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
			// the log area is circular - wrap the position around
	}

	PRINT(("replaying worked fine!\n"));
	// mark the log empty and the disk clean again
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
		SUPER_BLOCK_DISK_CLEAN);

	return fVolume->WriteSuperBlock();
}
613
614
615size_t
616Journal::CurrentTransactionSize() const
617{
618	if (_HasSubTransaction()) {
619		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
620			fTransactionID);
621	}
622
623	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
624		fTransactionID);
625}
626
627
628bool
629Journal::CurrentTransactionTooLarge() const
630{
631	return CurrentTransactionSize() > fLogSize;
632}
633
634
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// the oldest entry is done - log_start can move to the next oldest
		// entry, or to the log end if there is none left
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the superblock, and change the disk's state, if necessary

	if (update) {
		// if the log is now empty, the disk can be marked clean again
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
693
694
/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it. (We can't do this
	// in this thread, as flushing the log can produce new transaction events.)
	// Instead, wake up the dedicated log flusher thread.
	Journal* journal = (Journal*)_journal;
	release_sem(journal->fLogFlusherSem);
}
704
705
/*!	Body of the log flusher thread: waits on the flusher semaphore and
	flushes the log (without flushing all blocks) each time it is released.
	The loop ends once fLogFlusherSem has been invalidated by the destructor.
*/
/*static*/ status_t
Journal::_LogFlusher(void* _journal)
{
	Journal* journal = (Journal*)_journal;
	while (journal->fLogFlusherSem >= 0) {
		// acquire_sem() fails once the destructor deletes the semaphore
		if (acquire_sem(journal->fLogFlusherSem) != B_OK)
			continue;

		journal->_FlushLog(false, false);
	}
	return B_OK;
}
718
719
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// each run_array, followed by the blocks it describes, is written with
	// scattered I/O in (at most) one go per log wrap-around
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the superblock

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
926
927
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
	If \a canWait is \c false, the function returns immediately when the
	journal lock cannot be acquired.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
959
960
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
	May block until the journal lock could be acquired.
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
969
970
/*!	Acquires the journal lock for \a owner and - unless an enclosing
	transaction is simply reused - starts a new cache transaction, or a sub
	transaction if there are still unwritten ones.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	// the new owner's parent is the previous owner, so nested transactions
	// form a chain
	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified when the transaction becomes idle, so it can be
		// flushed (see _TransactionIdle())
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
1016
1017
/*!	Releases the journal lock. When this drops the outermost lock level (or
	separate sub transactions are in effect), the current transaction is
	completed or aborted via _TransactionDone(), and \a owner's listeners
	are notified about \a success.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;
				// NOTE(review): on failure the journal lock stays held -
				// verify that callers expect this

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1053
1054
1055uint32
1056Journal::_TransactionSize() const
1057{
1058	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1059		fTransactionID);
1060	if (count <= 0)
1061		return 0;
1062
1063	// take the number of array blocks in this transaction into account
1064	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1065	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1066	return count + arrayBlocks;
1067}
1068
1069
/*!	Ends the current transaction: aborts it if \a success is \c false;
	otherwise either batches it with further transactions (below a certain
	size) or writes it out to the log right away.
*/
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1100
1101
1102//	#pragma mark - debugger commands
1103
1104
1105#ifdef BFS_DEBUGGER_COMMANDS
1106
1107
/*!	Prints the journal's state and all pending log entries to the kernel
	debugger output.
*/
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
			entry->TransactionID(), entry->Start(), entry->Length());
	}
}
1135
1136
1137int
1138dump_journal(int argc, char** argv)
1139{
1140	if (argc != 2 || !strcmp(argv[1], "--help")) {
1141		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1142		return 0;
1143	}
1144
1145	Volume* volume = (Volume*)parse_expression(argv[1]);
1146	Journal* journal = volume->GetJournal(0);
1147
1148	journal->Dump();
1149	return 0;
1150}
1151
1152
1153#endif	// BFS_DEBUGGER_COMMANDS
1154
1155
1156//	#pragma mark - TransactionListener
1157
1158
// Listeners are registered with a Transaction and notified once it completes
// (see Transaction::NotifyListeners()).
TransactionListener::TransactionListener()
{
}
1162
1163
// The base class has nothing to clean up.
TransactionListener::~TransactionListener()
{
}
1167
1168
1169//	#pragma mark - Transaction
1170
1171
1172status_t
1173Transaction::Start(Volume* volume, off_t refBlock)
1174{
1175	// has it already been started?
1176	if (fJournal != NULL)
1177		return B_OK;
1178
1179	fJournal = volume->GetJournal(refBlock);
1180	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1181		return B_OK;
1182
1183	fJournal = NULL;
1184	return B_ERROR;
1185}
1186
1187
/*!	Registers \a listener with the running transaction; it is notified when
	the transaction completes (see NotifyListeners()).
*/
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}
1196
1197
/*!	Removes \a listener from the running transaction, and tells it that it
	has been removed.
*/
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}
1207
1208
1209void
1210Transaction::NotifyListeners(bool success)
1211{
1212	while (TransactionListener* listener = fListeners.RemoveHead()) {
1213		listener->TransactionDone(success);
1214		listener->RemovedFromTransaction();
1215	}
1216}
1217
1218
1219/*!	Move the inodes into the parent transaction. This is needed only to make
1220	sure they will still be reverted in case the transaction is aborted.
1221*/
1222void
1223Transaction::MoveListenersTo(Transaction* transaction)
1224{
1225	while (TransactionListener* listener = fListeners.RemoveHead()) {
1226		transaction->fListeners.Add(listener);
1227	}
1228}
1229