file_cache.cpp revision a38a92c9
/*
 * Copyright 2004-2007, Haiku Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 * 		Axel Dörfler <axeld@pinc-software.de>
 * 		Ingo Weinhold <bonefish@cs.tu-berlin.de>
 */

#include "fssh_fs_cache.h"

#include <new>

#include <stdlib.h>

#include "DoublyLinkedList.h"
#include "fssh_kernel_export.h"
#include "fssh_stdio.h"
#include "fssh_string.h"
#include "fssh_uio.h"
#include "fssh_unistd.h"
#include "hash.h"
#include "lock.h"
#include "vfs.h"


#undef TRACE
//#define TRACE_FILE_CACHE
#ifdef TRACE_FILE_CACHE
#	define TRACE(x) fssh_dprintf x
#else
#	define TRACE(x) ;
#endif

using std::nothrow;


// This is a hacked version of the kernel file cache implementation. The main
// part of the implementation didn't change that much -- some code not needed
// in userland has been removed, file_cache_ref, vm_cache_ref, and vm_cache
// have been unified, and the complete interface to the VM (vm_*()) and the
// VFS (vfs_*()) has been re-implemented to suit our needs.
//
// The PagePool class is a data structure used for managing the pages (vm_page)
// allocated and assigned to caches (file_cache_ref). It has a list for free
// pages, i.e. those that are not assigned to any cache and can be reused at
// once. A second list contains those pages that belong to a cache, but are
// not in use by any of the functions. These pages have a reference count of
// 0. They will be stolen from the owning cache when a usable page is needed,
// no free pages are left, and the maximum number of pages that shall be used
// has already been reached.
//
// The only purpose of the page reference count (vm_page::ref_count) is to
// indicate whether the page is in use (i.e. known to one or more of the cache
// functions currently being executed). vm_page_get_page(),
// vm_page_allocate_page(), and vm_cache_lookup_page() acquire a reference to
// the page; vm_page_put_page() releases a reference.
//
// vm_page::state indicates what state a page is currently in.
// PAGE_STATE_FREE is only encountered for pages not belonging to a cache
// (being in the page pool's free pages list, or just having been removed or
// not yet added). PAGE_STATE_ACTIVE/MODIFIED indicate that the page contains
// valid file data, in the latter case not yet written to disk.
// PAGE_STATE_BUSY means that the page is currently being manipulated by an
// operation, usually meaning that page data are being read from/written to
// disk. The required behavior when encountering a busy page is to put the
// page, release the page pool and cache locks, wait a short time, and retry
// later.
//
// Changing the page state requires having a reference to the page and
// holding the lock of the owning cache (if any) and the lock of the page pool
// (in case the page is not newly allocated).
//
// A page is in up to three lists: the free or unused list in the page pool
// (guarded by the page pool lock), the pages list of the cache the page
// belongs to, and the modified pages list of the cache (both cache lists
// are guarded by both the page pool and the cache lock). The modified pages
// list only contains PAGE_STATE_MODIFIED or PAGE_STATE_BUSY pages.
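//
// A minimal sketch of that busy-page protocol (illustrative only, not part
// of the build; the functions are the ones defined below):
//
//	vm_page* page = vm_cache_lookup_page(cache, offset);
//		// acquires a reference, if a page was found
//	if (page != NULL) {
//		if (page->state == PAGE_STATE_BUSY) {
//			// busy: put the page, drop the cache lock, wait, retry
//			vm_page_put_page(page);
//			mutex_unlock(&cache->lock);
//			fssh_snooze(20000);
//			mutex_lock(&cache->lock);
//			// ... look the page up again ...
//		} else {
//			// ... work with page->data, then release the reference ...
//			vm_page_put_page(page);
//		}
//	}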


// maximum number of iovecs per request
#define MAX_IO_VECS			64	// 256 kB
#define MAX_FILE_IO_VECS	32
#define MAX_TEMP_IO_VECS	8

#define CACHED_FILE_EXTENTS	2
	// must be smaller than MAX_FILE_IO_VECS
	// ToDo: find out how many of these are typically used

#define user_memcpy(a, b, c) fssh_memcpy(a, b, c)

#define PAGE_ALIGN(x) (((x) + (FSSH_B_PAGE_SIZE - 1)) & ~(FSSH_B_PAGE_SIZE - 1))
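	// e.g. assuming FSSH_B_PAGE_SIZE is 4096: PAGE_ALIGN(0) == 0,
	// PAGE_ALIGN(1) == 4096, PAGE_ALIGN(5000) == 8192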
#define ASSERT(x)

namespace FSShell {

enum {
	PAGE_STATE_ACTIVE = 0,
	PAGE_STATE_BUSY,
	PAGE_STATE_MODIFIED,
	PAGE_STATE_FREE
};

enum {
	PHYSICAL_PAGE_NO_WAIT = 0,
	PHYSICAL_PAGE_CAN_WAIT,
};

struct vm_page;
struct file_cache_ref;

typedef DoublyLinkedListLink<vm_page> page_link;

// vm page
struct vm_page {
	vm_page*			next;
	page_link			link_cache;
	page_link			link_cache_modified;
	page_link			link_pool;
	file_cache_ref*		cache;
	fssh_off_t			offset;
	void*				data;
	uint8_t				state;
	uint32_t			ref_count;

	vm_page()
		: next(NULL),
		  cache(NULL),
		  offset(0),
		  data(malloc(FSSH_B_PAGE_SIZE)),
		  state(PAGE_STATE_FREE),
		  ref_count(1)
	{
	}

	~vm_page()
	{
		free(data);
	}

	fssh_addr_t			Address() const	{ return (fssh_addr_t)data; }

	static int Compare(void *_cacheEntry, const void *_key);
	static uint32_t Hash(void *_cacheEntry, const void *_key, uint32_t range);
};

struct file_extent {
	fssh_off_t			offset;
	fssh_file_io_vec	disk;
};

struct file_map {
	file_map();
	~file_map();

	file_extent *operator[](uint32_t index);
	file_extent *ExtentAt(uint32_t index);
	fssh_status_t Add(fssh_file_io_vec *vecs, fssh_size_t vecCount, fssh_off_t &lastOffset);
	void Free();

	union {
		file_extent	direct[CACHED_FILE_EXTENTS];
		file_extent	*array;
	};
	fssh_size_t			count;
};
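// Example: with CACHED_FILE_EXTENTS == 2, a file_map holding at most two
// extents keeps them in the inline "direct" array of the union; once more
// extents are added, ExtentAt() serves all of them from the heap-allocated
// "array" instead (cf. the TODO in file_map::Add() about the direct range).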


static vm_page *vm_page_allocate_page(int state);
static void vm_page_get_page(vm_page* page);
static fssh_status_t vm_page_put_page(vm_page* page);
static fssh_status_t vm_page_set_state(vm_page *page, int state);

static void vm_cache_insert_page(file_cache_ref *cacheRef, vm_page *page,
	fssh_off_t offset);
static void vm_cache_remove_page(file_cache_ref *cacheRef, vm_page *page);
static vm_page *vm_cache_lookup_page(file_cache_ref *cacheRef, fssh_off_t page);
static fssh_status_t vm_cache_resize(file_cache_ref *cacheRef, fssh_off_t newSize);
static fssh_status_t vm_cache_write_modified(file_cache_ref *ref, bool fsReenter);

static fssh_status_t vfs_read_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs,
	fssh_size_t count, fssh_size_t *_numBytes, bool fsReenter);
static fssh_status_t vfs_write_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs,
	fssh_size_t count, fssh_size_t *_numBytes, bool fsReenter);

static fssh_status_t pages_io(file_cache_ref *ref, fssh_off_t offset, const fssh_iovec *vecs,
	fssh_size_t count, fssh_size_t *_numBytes, bool doWrite);


typedef DoublyLinkedList<vm_page,
	DoublyLinkedListMemberGetLink<vm_page,
		&vm_page::link_cache_modified> > cache_modified_page_list;

typedef DoublyLinkedList<vm_page,
	DoublyLinkedListMemberGetLink<vm_page,
		&vm_page::link_cache> > cache_page_list;

struct file_cache_ref {
	mutex						lock;
	fssh_mount_id				mountID;
	fssh_vnode_id				nodeID;
	void*						node;
	int							deviceFD;
	fssh_off_t					virtual_size;

	cache_page_list				pages;
	cache_modified_page_list	modifiedPages;

	file_map					map;
};

struct page_hash_key {
	page_hash_key() {}
	page_hash_key(int fd, fssh_vnode_id id, fssh_off_t offset)
		: deviceFD(fd),
		  nodeID(id),
		  offset(offset)
	{
	}

	int				deviceFD;
	fssh_vnode_id	nodeID;
	fssh_off_t		offset;
};


typedef DoublyLinkedList<vm_page,
	DoublyLinkedListMemberGetLink<vm_page,
		&vm_page::link_pool> > pool_page_list;

struct PagePool {

	PagePool()
		: pageHash(NULL),
		  unusedPages(),
		  freePages(),
		  allocatedPages(0)
	{
	}

	~PagePool()
	{
	}

	fssh_status_t Init()
	{
		fssh_status_t error = recursive_lock_init(&lock, "page pool");
		if (error != FSSH_B_OK) {
			fssh_panic("PagePool: Failed to init lock\n");
			return error;
		}

		pageHash = hash_init(256, 0, &vm_page::Compare, &vm_page::Hash);
		if (pageHash == NULL) {
			fssh_panic("PagePool: Failed to create page hash\n");
			return FSSH_B_NO_MEMORY;
		}

		return FSSH_B_OK;
	}

	bool Lock() { return (recursive_lock_lock(&lock) == FSSH_B_OK); }
	void Unlock() { recursive_lock_unlock(&lock); }

	recursive_lock	lock;
	hash_table*		pageHash;
	pool_page_list	unusedPages;
	pool_page_list	freePages;
	uint32_t		allocatedPages;
};

struct PagePutter {
	PagePutter(vm_page* page)
		: fPage(page)
	{
	}

	~PagePutter()
	{
		if (fPage)
			vm_page_put_page(fPage);
	}

private:
	vm_page*	fPage;
};

static PagePool sPagePool;
static const uint32_t kMaxAllocatedPages = 1024;


int
vm_page::Compare(void *_cacheEntry, const void *_key)
{
	vm_page *page = (vm_page*)_cacheEntry;
	const page_hash_key *key = (const page_hash_key *)_key;

	// device FD
	if (page->cache->deviceFD != key->deviceFD)
		return page->cache->deviceFD - key->deviceFD;

	// node ID
	if (page->cache->nodeID < key->nodeID)
		return -1;
	if (page->cache->nodeID > key->nodeID)
		return 1;

	// offset
	if (page->offset < key->offset)
		return -1;
	if (page->offset > key->offset)
		return 1;

	return 0;
}

uint32_t
vm_page::Hash(void *_cacheEntry, const void *_key, uint32_t range)
{
	vm_page *page = (vm_page*)_cacheEntry;
	const page_hash_key *key = (const page_hash_key *)_key;

	int fd = (page ? page->cache->deviceFD : key->deviceFD);
	fssh_vnode_id id = (page ? page->cache->nodeID : key->nodeID);
	fssh_off_t offset = (page ? page->offset : key->offset);

	uint32_t value = fd;
	value = value * 17 + id;
	value = value * 17 + offset;

	return value % range;
}


vm_page *
vm_page_allocate_page(int state)
{
	AutoLocker<PagePool> poolLocker(sPagePool);

	// is a queued free page available?
	vm_page* page = sPagePool.freePages.RemoveHead();
	if (page) {
		page->ref_count++;
		return page;
	}

	// no free page

#if 0
// TODO: Nice idea in principle, but causes a serious locking problem.
// vm_page_allocate_page() is always invoked with some cache locked at the
// same time. Thus locking a cache here to steal a page can cause a deadlock.
	// If the limit for allocated pages has been reached, we try to steal an
	// unused page.
	if (sPagePool.allocatedPages >= kMaxAllocatedPages
		&& !sPagePool.unusedPages.IsEmpty()) {
		// we grab the first non-busy page
		for (pool_page_list::Iterator it(&sPagePool.unusedPages);
			vm_page* currentPage = it.Next();) {
			if (currentPage->state != PAGE_STATE_BUSY) {
				it.Remove();
				page = currentPage;
				page->ref_count++;
				break;
			}
		}

		// If we've found a suitable page, we need to mark it busy, write it
		// if it was modified, and finally remove it from its cache.
		if (page != NULL) {
			bool modified = (page->state == PAGE_STATE_MODIFIED);

			// mark the page busy
			page->state = PAGE_STATE_BUSY;

			// We don't need the pool lock anymore.
			poolLocker.Unlock();

			file_cache_ref* cache = page->cache;

			// If the page is modified, we write it to the disk.
			if (modified) {
				// find out how much to write, and remove the page from the
				// cache's modified pages list
				MutexLocker cacheLocker(cache->lock);
				fssh_size_t bytes = fssh_min_c(FSSH_B_PAGE_SIZE,
					cache->virtual_size - page->offset);
				cache->modifiedPages.Remove(page);
				cacheLocker.Unlock();

				// if we need to write anything, do it now
				if (bytes > 0) {
					fssh_iovec vecs[1];
					vecs[0].iov_base = page->data;
					vecs[0].iov_len = bytes;
					fssh_status_t error = pages_io(cache, page->offset, vecs, 1,
						&bytes, true);
					if (error != FSSH_B_OK) {
						// failed to write the page: bad, but we can't do
						// much about it
						fssh_dprintf("vm_page_allocate_page(): Failed to write "
							"page %p (cache %p, offset: %lld).\n", page,
							cache, page->offset);
					}
				}
			}

			// remove the page from the cache
			MutexLocker cacheLocker(cache->lock);
			vm_cache_remove_page(cache, page);

			// now it's ours
			page->state = PAGE_STATE_FREE;

			return page;
		}
	}
#endif	// 0

	// no page yet -- allocate a new one

	page = new(nothrow) vm_page;
	if (!page || !page->data) {
		delete page;
		return NULL;
	}

	sPagePool.allocatedPages++;

	return page;
}


static void
vm_page_get_page(vm_page* page)
{
	if (page) {
		AutoLocker<PagePool> _(sPagePool);

		// increase ref count
		page->ref_count++;

		// if the page was unused before, remove it from the unused pages list
		if (page->ref_count == 1)
			sPagePool.unusedPages.Remove(page);
	}
}


static fssh_status_t
vm_page_put_page(vm_page* page)
{
	if (!page)
		return FSSH_B_BAD_VALUE;

	AutoLocker<PagePool> _(sPagePool);

	if (page->ref_count <= 0) {
		fssh_panic("vm_page_put_page(): page %p already unreferenced!\n", page);
		return FSSH_B_BAD_VALUE;
	}

	// decrease ref count
	page->ref_count--;

	if (page->ref_count > 0)
		return FSSH_B_OK;

	// the page is unreferenced now: add it to the unused or free pages list

	if (page->state == PAGE_STATE_FREE) {
		// page is free
		// if we've maxed out the allowed number of allocated pages, free this
		// one, otherwise add it to the free list
		if (sPagePool.allocatedPages > kMaxAllocatedPages) {
			delete page;
			sPagePool.allocatedPages--;
		} else
			sPagePool.freePages.Add(page);
	} else {
		// page is not free; add it to the unused pages list
		sPagePool.unusedPages.Add(page);
	}

	return FSSH_B_OK;
}


fssh_status_t
vm_page_set_state(vm_page *page, int state)
{
	AutoLocker<PagePool> _(sPagePool);

	if (page->ref_count <= 0) {
		fssh_panic("vm_page_set_state(): page %p is already unreferenced!\n",
			page);
		return FSSH_B_BAD_VALUE;
	}

	if (state == page->state)
		return FSSH_B_OK;

	// If it was modified before, remove the page from the cache's modified
	// pages list.
	if (page->state == PAGE_STATE_MODIFIED && page->cache)
		page->cache->modifiedPages.Remove(page);

	switch (state) {
		case PAGE_STATE_ACTIVE:
		case PAGE_STATE_BUSY:
		case PAGE_STATE_FREE:
			page->state = state;
			break;

		case PAGE_STATE_MODIFIED:
		{
			page->state = state;

			// add page to the modified list of the cache
			if (!page->cache) {
				fssh_panic("vm_page_set_state(): setting page state of page %p "
					"to PAGE_STATE_MODIFIED, but page is not in a cache\n",
					page);
				return FSSH_B_BAD_VALUE;
			}

			page->cache->modifiedPages.Add(page);

			break;
		}

		default:
			fssh_panic("vm_page_set_state(): invalid page state: %d\n", state);
			return FSSH_B_BAD_VALUE;
	}

	return FSSH_B_OK;
}


// cache must be locked
//
void
vm_cache_insert_page(file_cache_ref *cache, vm_page *page, fssh_off_t offset)
{
	AutoLocker<PagePool> _(sPagePool);

	if (page->cache != NULL) {
		fssh_panic("vm_cache_insert_page(%p, %p): page already in cache %p\n",
			cache, page, page->cache);
		return;
	}

	page->cache = cache;
	page->offset = offset;

	// insert page into hash
	fssh_status_t error = hash_insert(sPagePool.pageHash, page);
	if (error != FSSH_B_OK) {
		fssh_panic("vm_cache_insert_page(): Failed to insert page %p into hash!\n",
			page);
		page->cache = NULL;
		page->offset = 0;
		return;
	}

	// add page to cache page list
	cache->pages.Add(page);
}


// cache must be locked
//
void
vm_cache_remove_page(file_cache_ref *cache, vm_page *page)
{
	if (cache != page->cache) {
		fssh_panic("vm_cache_remove_page(%p, %p): page is in cache %p\n",
			cache, page, page->cache);
		return;
	}

	AutoLocker<PagePool> _(sPagePool);

	if (page->state == PAGE_STATE_MODIFIED)
		cache->modifiedPages.Remove(page);

	cache->pages.Remove(page);

	hash_remove(sPagePool.pageHash, page);

	page->cache = NULL;
	page->offset = 0;
}


vm_page *
vm_cache_lookup_page(file_cache_ref *cache, fssh_off_t offset)
{
	if (!cache)
		return NULL;

	AutoLocker<PagePool> _(sPagePool);

	page_hash_key key(cache->deviceFD, cache->nodeID, offset);

	vm_page* page = (vm_page*)hash_lookup(sPagePool.pageHash, &key);

	if (page)
		vm_page_get_page(page);

	return page;
}


// cache must be locked
//
fssh_status_t
vm_cache_resize(file_cache_ref *cache, fssh_off_t newSize)
{
	fssh_off_t oldAlignedSize = PAGE_ALIGN(cache->virtual_size);
	fssh_off_t newAlignedSize = PAGE_ALIGN(newSize);

	cache->virtual_size = newSize;

	// if the aligned cache size increased or remained the same, we're done
	if (newAlignedSize >= oldAlignedSize)
		return FSSH_B_OK;

	// the file shrinks, so we need to get rid of excess pages

	// Hold the page pool lock virtually all the time from now on.
	AutoLocker<PagePool> poolLocker(sPagePool);

	// For the sake of efficiency we sort the cache's list of pages so that
	// all pages that need to be removed are at the beginning of the list.
	vm_page* page = cache->pages.Head();
	if (newAlignedSize > 0) {
		while (page) {
			vm_page* nextPage = cache->pages.GetNext(page);

			if (page->offset >= newAlignedSize) {
				// move to the beginning of the list
				cache->pages.Remove(page);
				cache->pages.Add(page, false);
			}

			page = nextPage;
		}
	}

	// now we remove and free the excess pages one by one
	while (true) {
		// get the first page in the list to remove
		// (since we sorted the list, this is usually very cheap)
		for (cache_page_list::Iterator it(&cache->pages); (page = it.Next());) {
			if (page->offset >= newAlignedSize)
				break;
		}

		// no more pages? -- then we're done
		if (!page)
			return FSSH_B_OK;

		if (page->state == PAGE_STATE_BUSY) {
			// the page is busy -- wait a while and try again
			poolLocker.Unlock();
			mutex_unlock(&cache->lock);
			fssh_snooze(20000);
			mutex_lock(&cache->lock);
			poolLocker.Lock();
		} else {
			// the page isn't busy -- get rid of it
			vm_page_get_page(page);
			vm_cache_remove_page(cache, page);
			vm_page_set_state(page, PAGE_STATE_FREE);
			vm_page_put_page(page);
		}
	}

	return FSSH_B_OK;
}


fssh_status_t
vm_cache_write_modified(file_cache_ref *cache, bool fsReenter)
{
	// TODO: Write more than one page at a time. To avoid deadlocks, when a
	// busy page is encountered the previously collected pages need to be
	// written. Otherwise as many pages as our on-stack array can contain
	// can be processed at once.
	MutexLocker locker(cache->lock);
	while (true) {
		// get the next modified page and mark it busy
		vm_page* page = NULL;

		while (true) {
			// get the first modified page
			AutoLocker<PagePool> poolLocker(sPagePool);
			page = cache->modifiedPages.Head();

			if (!page)
				return FSSH_B_OK;

			// if not marked busy, remove it and mark it busy
			if (page->state != PAGE_STATE_BUSY) {
				cache->modifiedPages.Remove(page);
				vm_page_get_page(page);
				page->state = PAGE_STATE_BUSY;
				break;
			}

			// page is busy -- wait a while
			vm_page_put_page(page);
			poolLocker.Unlock();
			locker.Unlock();
			fssh_snooze(20000);
			locker.Lock();
		}

		locker.Unlock();

		// write the page
		fssh_size_t bytes = fssh_min_c(FSSH_B_PAGE_SIZE, cache->virtual_size - page->offset);
		fssh_iovec vecs[1];
		vecs[0].iov_base = page->data;
		vecs[0].iov_len = bytes;
		fssh_status_t error = pages_io(cache, page->offset, vecs, 1, &bytes, true);
		if (error != FSSH_B_OK)
			return error;

		locker.Lock();

		vm_page_set_state(page, PAGE_STATE_ACTIVE);
		vm_page_put_page(page);
	}

	return FSSH_B_OK;
}


fssh_status_t
vfs_read_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs, fssh_size_t count,
	fssh_size_t *_numBytes, bool fsReenter)
{
	// check how much the iovecs allow us to read
	fssh_size_t toRead = 0;
	for (fssh_size_t i = 0; i < count; i++)
		toRead += vecs[i].iov_len;

	fssh_iovec* newVecs = NULL;
	if (*_numBytes < toRead) {
		// We're supposed to read less than specified by the vecs. Since
		// readv_pos() doesn't support this, we need to clone the vecs.
		newVecs = new(nothrow) fssh_iovec[count];
		if (!newVecs)
			return FSSH_B_NO_MEMORY;

		fssh_size_t newCount = 0;
		for (fssh_size_t i = 0; i < count && toRead > 0; i++) {
			fssh_size_t vecLen = fssh_min_c(vecs[i].iov_len, toRead);
			newVecs[i].iov_base = vecs[i].iov_base;
			newVecs[i].iov_len = vecLen;
			toRead -= vecLen;
			newCount++;
		}

		vecs = newVecs;
		count = newCount;
	}

	fssh_ssize_t bytesRead = fssh_readv_pos(fd, pos, vecs, count);
	delete[] newVecs;
	if (bytesRead < 0)
		return bytesRead;

	*_numBytes = bytesRead;
	return FSSH_B_OK;
}


fssh_status_t
vfs_write_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs, fssh_size_t count,
	fssh_size_t *_numBytes, bool fsReenter)
{
	// check how much the iovecs allow us to write
	fssh_size_t toWrite = 0;
	for (fssh_size_t i = 0; i < count; i++)
		toWrite += vecs[i].iov_len;

	fssh_iovec* newVecs = NULL;
	if (*_numBytes < toWrite) {
		// We're supposed to write less than specified by the vecs. Since
		// writev_pos() doesn't support this, we need to clone the vecs.
		newVecs = new(nothrow) fssh_iovec[count];
		if (!newVecs)
			return FSSH_B_NO_MEMORY;

		fssh_size_t newCount = 0;
		for (fssh_size_t i = 0; i < count && toWrite > 0; i++) {
			fssh_size_t vecLen = fssh_min_c(vecs[i].iov_len, toWrite);
			newVecs[i].iov_base = vecs[i].iov_base;
			newVecs[i].iov_len = vecLen;
			toWrite -= vecLen;
			newCount++;
		}

		vecs = newVecs;
		count = newCount;
	}

	fssh_ssize_t bytesWritten = fssh_writev_pos(fd, pos, vecs, count);
	delete[] newVecs;
	if (bytesWritten < 0)
		return bytesWritten;

	*_numBytes = bytesWritten;
	return FSSH_B_OK;
}


fssh_status_t
file_cache_init()
{
	fssh_status_t error = sPagePool.Init();
	if (error != FSSH_B_OK)
		return error;

	return FSSH_B_OK;
}


// #pragma mark -


file_map::file_map()
{
	array = NULL;
	count = 0;
}


file_map::~file_map()
{
	Free();
}


file_extent *
file_map::operator[](uint32_t index)
{
	return ExtentAt(index);
}


file_extent *
file_map::ExtentAt(uint32_t index)
{
	if (index >= count)
		return NULL;

	if (count > CACHED_FILE_EXTENTS)
		return &array[index];

	return &direct[index];
}


fssh_status_t
file_map::Add(fssh_file_io_vec *vecs, fssh_size_t vecCount, fssh_off_t &lastOffset)
{
	TRACE(("file_map::Add(vecCount = %ld)\n", vecCount));

	fssh_off_t offset = 0;

	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
		// just use the reserved area in the file_cache_ref structure
	} else {
		// TODO: once we can invalidate only parts of the file map,
		//	we might need to copy the previously cached file extents
		//	from the direct range
		file_extent *newMap = (file_extent *)realloc(array,
			(count + vecCount) * sizeof(file_extent));
		if (newMap == NULL)
			return FSSH_B_NO_MEMORY;

		array = newMap;

		if (count != 0) {
			file_extent *extent = ExtentAt(count - 1);
			offset = extent->offset + extent->disk.length;
		}
	}

	int32_t start = count;
	count += vecCount;

	for (uint32_t i = 0; i < vecCount; i++) {
		file_extent *extent = ExtentAt(start + i);

		extent->offset = offset;
		extent->disk = vecs[i];

		offset += extent->disk.length;
	}

#ifdef TRACE_FILE_CACHE
	for (uint32_t i = 0; i < count; i++) {
		file_extent *extent = ExtentAt(i);
		fssh_dprintf("[%ld] extent offset %Ld, disk offset %Ld, length %Ld\n",
			i, extent->offset, extent->disk.offset, extent->disk.length);
	}
#endif

	lastOffset = offset;
	return FSSH_B_OK;
}


void
file_map::Free()
{
	if (count > CACHED_FILE_EXTENTS)
		free(array);

	array = NULL;
	count = 0;
}


//	#pragma mark -


static void
add_to_iovec(fssh_iovec *vecs, int32_t &index, int32_t max, fssh_addr_t address, fssh_size_t size)
{
	if (index > 0 && (fssh_addr_t)vecs[index - 1].iov_base + vecs[index - 1].iov_len == address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].iov_len += size;
		return;
	}

	if (index == max)
		fssh_panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].iov_base = (void *)address;
	vecs[index].iov_len = size;
	index++;
}
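// For example (illustrative only): two FSSH_B_PAGE_SIZE sized page buffers
// that happen to be adjacent in memory, at addresses A and
// A + FSSH_B_PAGE_SIZE, are merged into one iovec of twice the length, while
// non-adjacent buffers each get an iovec of their own.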


static file_extent *
find_file_extent(file_cache_ref *ref, fssh_off_t offset, uint32_t *_index)
{
	// TODO: do binary search

	for (uint32_t index = 0; index < ref->map.count; index++) {
		file_extent *extent = ref->map[index];

		if (extent->offset <= offset
			&& extent->offset + extent->disk.length > offset) {
			if (_index)
				*_index = index;
			return extent;
		}
	}

	return NULL;
}


static fssh_status_t
get_file_map(file_cache_ref *ref, fssh_off_t offset, fssh_size_t size,
	fssh_file_io_vec *vecs, fssh_size_t *_count)
{
	fssh_size_t maxVecs = *_count;
	fssh_status_t status = FSSH_B_OK;

	if (ref->map.count == 0) {
		// we don't yet have the map of this file, so let's grab it
		// (ordered by offset, so that we can do a binary search on them)

		mutex_lock(&ref->lock);

		// the file map could have been requested in the meantime
		if (ref->map.count == 0) {
			fssh_size_t vecCount = maxVecs;
			fssh_off_t mapOffset = 0;

			while (true) {
				status = vfs_get_file_map(ref->node, mapOffset, ~0UL, vecs,
					&vecCount);
				if (status < FSSH_B_OK && status != FSSH_B_BUFFER_OVERFLOW) {
					mutex_unlock(&ref->lock);
					return status;
				}

				fssh_status_t addStatus = ref->map.Add(vecs, vecCount, mapOffset);
				if (addStatus != FSSH_B_OK) {
					// only clobber the status in case of failure
					status = addStatus;
				}

				if (status != FSSH_B_BUFFER_OVERFLOW)
					break;

				// if we get here, the map has been stored in the array, but
				// the array was still too small to cover the whole file
				vecCount = maxVecs;
			}
		}

		mutex_unlock(&ref->lock);
	}

	if (status != FSSH_B_OK) {
		// We must invalidate the (part of the) map we already
		// have, as we cannot know if it's complete or not
		ref->map.Free();
		return status;
	}

	// We now have the map of this file cached; next we need to
	// translate it for the requested access.

	uint32_t index;
	file_extent *fileExtent = find_file_extent(ref, offset, &index);
	if (fileExtent == NULL) {
		// access outside file bounds? But that's not our problem
		*_count = 0;
		return FSSH_B_OK;
	}

	offset -= fileExtent->offset;
	vecs[0].offset = fileExtent->disk.offset + offset;
	vecs[0].length = fileExtent->disk.length - offset;

	if (vecs[0].length >= size || index >= ref->map.count - 1) {
		*_count = 1;
		return FSSH_B_OK;
	}

	// copy the rest of the vecs

	size -= vecs[0].length;

	for (index = 1; index < ref->map.count;) {
		fileExtent++;

		vecs[index] = fileExtent->disk;
		index++;

		if (size <= fileExtent->disk.length)
			break;

		if (index >= maxVecs) {
			*_count = index;
			return FSSH_B_BUFFER_OVERFLOW;
		}

		size -= fileExtent->disk.length;
	}

	*_count = index;
	return FSSH_B_OK;
}


/*!
	Does the dirty work of translating the request into actual disk offsets
	and reads to or writes from the supplied iovecs as specified by \a doWrite.
*/
static fssh_status_t
pages_io(file_cache_ref *ref, fssh_off_t offset, const fssh_iovec *vecs, fssh_size_t count,
	fssh_size_t *_numBytes, bool doWrite)
{
	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, vecCount = %lu, %s\n",
		ref, offset, *_numBytes, count, doWrite ? "write" : "read"));

	// translate the iovecs into direct device accesses
	fssh_file_io_vec fileVecs[MAX_FILE_IO_VECS];
	fssh_size_t fileVecCount = MAX_FILE_IO_VECS;
	fssh_size_t numBytes = *_numBytes;

	fssh_status_t status = get_file_map(ref, offset, numBytes, fileVecs,
		&fileVecCount);
	if (status < FSSH_B_OK && status != FSSH_B_BUFFER_OVERFLOW) {
		TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
			offset, numBytes, fssh_strerror(status)));
		return status;
	}

	bool bufferOverflow = status == FSSH_B_BUFFER_OVERFLOW;

#ifdef TRACE_FILE_CACHE
	fssh_dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount, offset,
		numBytes, bufferOverflow ? " (array too small)" : "");
	for (fssh_size_t i = 0; i < fileVecCount; i++) {
		fssh_dprintf("  [%lu] offset = %Ld, size = %Ld\n",
			i, fileVecs[i].offset, fileVecs[i].length);
	}
#endif

	if (fileVecCount == 0) {
		// There are no file vecs at this offset, so we're obviously trying
		// to access the file outside of its bounds
		TRACE(("pages_io: access outside of vnode %p at offset %Ld\n",
			ref->node, offset));
		return FSSH_B_BAD_VALUE;
	}

	uint32_t fileVecIndex;
	fssh_size_t size;

	if (!doWrite) {
		// now directly read the data from the device
		// the first file_io_vec can be read directly

		size = fileVecs[0].length;
		if (size > numBytes)
			size = numBytes;

		status = vfs_read_pages(ref->deviceFD, fileVecs[0].offset, vecs,
			count, &size, false);
		if (status < FSSH_B_OK)
			return status;

		// TODO: this is a work-around for buggy device drivers!
		//	When our own drivers honour the length, we can:
		//	a) also use this direct I/O for writes (otherwise, it would
		//	   overwrite precious data)
		//	b) panic if the term below is true (at least for writes)
		if (size > fileVecs[0].length) {
			//fssh_dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
			size = fileVecs[0].length;
		}

		ASSERT(size <= fileVecs[0].length);

		// If the file portion was contiguous, we're already done now
		if (size == numBytes)
			return FSSH_B_OK;

		// if we reached the end of the file, we can return as well
		if (size != fileVecs[0].length) {
			*_numBytes = size;
			return FSSH_B_OK;
		}

		fileVecIndex = 1;
	} else {
		fileVecIndex = 0;
		size = 0;
	}

	// Too bad, let's process the rest of the file_io_vecs

	fssh_size_t totalSize = size;

	// first, find out where we have to continue in our iovecs
	uint32_t i = 0;
	for (; i < count; i++) {
		if (size < vecs[i].iov_len)
			break;

		size -= vecs[i].iov_len;
	}

	fssh_size_t vecOffset = size;
	fssh_size_t bytesLeft = numBytes - size;

	while (true) {
		for (; fileVecIndex < fileVecCount; fileVecIndex++) {
			fssh_file_io_vec &fileVec = fileVecs[fileVecIndex];
			fssh_off_t fileOffset = fileVec.offset;
			fssh_off_t fileLeft = fssh_min_c(fileVec.length, bytesLeft);

			TRACE(("FILE VEC [%lu] length %Ld\n", fileVecIndex, fileLeft));

			// process the complete fileVec
			while (fileLeft > 0) {
				fssh_iovec tempVecs[MAX_TEMP_IO_VECS];
				uint32_t tempCount = 0;

				// size tracks how much of what is left of the current fileVec
				// (fileLeft) has been assigned to tempVecs
				size = 0;

				// assign what is left of the current fileVec to the tempVecs
				for (size = 0; size < fileLeft && i < count
						&& tempCount < MAX_TEMP_IO_VECS;) {
					// try to satisfy one iovec per iteration (or as much as
					// possible)

					// bytes left of the current iovec
					fssh_size_t vecLeft = vecs[i].iov_len - vecOffset;
					if (vecLeft == 0) {
						vecOffset = 0;
						i++;
						continue;
					}

					TRACE(("fill vec %ld, offset = %lu, size = %lu\n",
						i, vecOffset, size));

					// actually available bytes
					fssh_size_t tempVecSize = fssh_min_c(vecLeft, fileLeft - size);

					tempVecs[tempCount].iov_base
						= (void *)((fssh_addr_t)vecs[i].iov_base + vecOffset);
					tempVecs[tempCount].iov_len = tempVecSize;
					tempCount++;

					size += tempVecSize;
					vecOffset += tempVecSize;
				}

				fssh_size_t bytes = size;
				if (doWrite) {
					status = vfs_write_pages(ref->deviceFD, fileOffset,
						tempVecs, tempCount, &bytes, false);
				} else {
					status = vfs_read_pages(ref->deviceFD, fileOffset,
						tempVecs, tempCount, &bytes, false);
				}
				if (status < FSSH_B_OK)
					return status;

				totalSize += bytes;
				bytesLeft -= size;
				fileOffset += size;
				fileLeft -= size;
				//fssh_dprintf("-> file left = %Lu\n", fileLeft);

				if (size != bytes || i >= count) {
					// there are no more bytes or iovecs, let's bail out
					*_numBytes = totalSize;
					return FSSH_B_OK;
				}
			}
		}

		if (bufferOverflow) {
			status = get_file_map(ref, offset + totalSize, bytesLeft, fileVecs,
				&fileVecCount);
			if (status < FSSH_B_OK && status != FSSH_B_BUFFER_OVERFLOW) {
				TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
					offset, numBytes, fssh_strerror(status)));
				return status;
			}

			bufferOverflow = status == FSSH_B_BUFFER_OVERFLOW;
			fileVecIndex = 0;

#ifdef TRACE_FILE_CACHE
			fssh_dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount,
				offset + totalSize, numBytes,
				bufferOverflow ? " (array too small)" : "");
			for (fssh_size_t i = 0; i < fileVecCount; i++) {
				fssh_dprintf("  [%lu] offset = %Ld, size = %Ld\n",
					i, fileVecs[i].offset, fileVecs[i].length);
			}
#endif
		} else
			break;
	}

	*_numBytes = totalSize;
	return FSSH_B_OK;
}


/*!
	This function is called by read_into_cache() (and from there only) - it
	can only handle a certain amount of bytes, and read_into_cache() makes
	sure that it matches that criterion.
*/
static inline fssh_status_t
read_chunk_into_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t numBytes,
	int32_t pageOffset, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	TRACE(("read_chunk(offset = %Ld, size = %lu, pageOffset = %ld, buffer = %#lx, bufferSize = %lu\n",
		offset, numBytes, pageOffset, buffer, bufferSize));

	fssh_iovec vecs[MAX_IO_VECS];
	int32_t vecCount = 0;

	vm_page *pages[MAX_IO_VECS];
	int32_t pageIndex = 0;

	// allocate pages for the cache and mark them busy
	for (fssh_size_t pos = 0; pos < numBytes; pos += FSSH_B_PAGE_SIZE) {
		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
		if (page == NULL)
			fssh_panic("no more pages!");

		page->state = PAGE_STATE_BUSY;

		vm_cache_insert_page(ref, page, offset + pos);

		fssh_addr_t virtualAddress = page->Address();

		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, FSSH_B_PAGE_SIZE);
		// ToDo: check if the array is large enough!
	}

	mutex_unlock(&ref->lock);

	// read file into reserved pages
	fssh_status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, false);
	if (status < FSSH_B_OK) {
		// reading failed, free allocated pages

		fssh_dprintf("file_cache: read pages failed: %s\n", fssh_strerror(status));

		mutex_lock(&ref->lock);

		for (int32_t i = 0; i < pageIndex; i++) {
			vm_cache_remove_page(ref, pages[i]);
			vm_page_set_state(pages[i], PAGE_STATE_FREE);
			vm_page_put_page(pages[i]);
		}

		return status;
	}

	// copy the pages and unmap them again

	for (int32_t i = 0; i < vecCount; i++) {
		fssh_addr_t base = (fssh_addr_t)vecs[i].iov_base;
		fssh_size_t size = vecs[i].iov_len;

		// copy to user buffer if necessary
		if (bufferSize != 0) {
			fssh_size_t bytes = fssh_min_c(bufferSize, size - pageOffset);

			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
			buffer += bytes;
			bufferSize -= bytes;
			pageOffset = 0;
		}
	}

	mutex_lock(&ref->lock);

	// make the pages accessible in the cache
	for (int32_t i = pageIndex; i-- > 0;) {
		vm_page_set_state(pages[i], PAGE_STATE_ACTIVE);
		vm_page_put_page(pages[i]);
	}

	return FSSH_B_OK;
}


/*!
	This function reads \a size bytes directly from the file into the cache.
	If \a bufferSize does not equal zero, \a bufferSize bytes from the data
	read in are also copied to the provided \a buffer.
	This function always allocates all pages; it is the responsibility of the
	calling function to only ask for yet uncached ranges.
	The cache_ref lock must be held when calling this function.
*/
static fssh_status_t
read_into_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t size, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	TRACE(("read_into_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
		ref, offset, size, (void *)buffer, bufferSize));

	// do we have to read in anything at all?
	if (size == 0)
		return FSSH_B_OK;

	// make sure "offset" is page aligned - but also remember the page offset
	int32_t pageOffset = offset & (FSSH_B_PAGE_SIZE - 1);
	size = PAGE_ALIGN(size + pageOffset);
	offset -= pageOffset;

	while (true) {
		fssh_size_t chunkSize = size;
		if (chunkSize > (MAX_IO_VECS * FSSH_B_PAGE_SIZE))
			chunkSize = MAX_IO_VECS * FSSH_B_PAGE_SIZE;

		fssh_status_t status = read_chunk_into_cache(ref, offset, chunkSize, pageOffset,
								buffer, bufferSize);
		if (status != FSSH_B_OK)
			return status;

		if ((size -= chunkSize) == 0)
			return FSSH_B_OK;

		if (chunkSize >= bufferSize) {
			bufferSize = 0;
			buffer = 0;
		} else {
			bufferSize -= chunkSize - pageOffset;
			buffer += chunkSize - pageOffset;
		}

		offset += chunkSize;
		pageOffset = 0;
	}

	return FSSH_B_OK;
}


/**	Like read_chunk_into_cache() but writes data into the cache */

static inline fssh_status_t
write_chunk_to_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t numBytes,
	int32_t pageOffset, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	fssh_iovec vecs[MAX_IO_VECS];
	int32_t vecCount = 0;
	vm_page *pages[MAX_IO_VECS];
	int32_t pageIndex = 0;
	fssh_status_t status = FSSH_B_OK;

	// ToDo: this should be settable somewhere
	bool writeThrough = false;

	// allocate pages for the cache and mark them busy
	for (fssh_size_t pos = 0; pos < numBytes; pos += FSSH_B_PAGE_SIZE) {
		// ToDo: if space is becoming tight, and this cache is already grown
		//	big - shouldn't we better steal the pages directly in that case?
		//	(a working set like approach for the file cache)
		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
		page->state = PAGE_STATE_BUSY;

		vm_cache_insert_page(ref, page, offset + pos);

		fssh_addr_t virtualAddress = page->Address();

		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, FSSH_B_PAGE_SIZE);
		// ToDo: check if the array is large enough!
	}

	mutex_unlock(&ref->lock);

	// copy contents (and read in partially written pages first)

	if (pageOffset != 0) {
		// This is only a partial write, so we have to read the rest of the page
		// from the file to have consistent data in the cache
		fssh_iovec readVec = { vecs[0].iov_base, FSSH_B_PAGE_SIZE };
		fssh_size_t bytesRead = FSSH_B_PAGE_SIZE;

		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
		// ToDo: handle errors for real!
		if (status < FSSH_B_OK)
			fssh_panic("1. pages_io() failed: %s!\n", fssh_strerror(status));
	}

	fssh_addr_t lastPageOffset = (pageOffset + bufferSize) & (FSSH_B_PAGE_SIZE - 1);
	if (lastPageOffset != 0) {
		// get the last page in the I/O vectors
		fssh_addr_t last = (fssh_addr_t)vecs[vecCount - 1].iov_base
			+ vecs[vecCount - 1].iov_len - FSSH_B_PAGE_SIZE;

		if (offset + pageOffset + bufferSize == ref->virtual_size) {
			// the space in the page after this write action needs to be cleared
			fssh_memset((void *)(last + lastPageOffset), 0, FSSH_B_PAGE_SIZE - lastPageOffset);
		} else if (vecCount > 1) {
			// the end of this write does not happen on a page boundary, so we
			// need to fetch the last page before we can update it
			fssh_iovec readVec = { (void *)last, FSSH_B_PAGE_SIZE };
			fssh_size_t bytesRead = FSSH_B_PAGE_SIZE;

			status = pages_io(ref, offset + numBytes - FSSH_B_PAGE_SIZE, &readVec, 1,
				&bytesRead, false);
			// ToDo: handle errors for real!
			if (status < FSSH_B_OK)
				fssh_panic("pages_io() failed: %s!\n", fssh_strerror(status));
		}
	}

	for (int32_t i = 0; i < vecCount; i++) {
		fssh_addr_t base = (fssh_addr_t)vecs[i].iov_base;
		fssh_size_t bytes = fssh_min_c(bufferSize, fssh_size_t(vecs[i].iov_len - pageOffset));

		// copy data from user buffer
		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);

		bufferSize -= bytes;
		if (bufferSize == 0)
			break;

		buffer += bytes;
		pageOffset = 0;
	}

	if (writeThrough) {
		// write cached pages back to the file if we were asked to do that
		fssh_status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, true);
		if (status < FSSH_B_OK) {
			// ToDo: remove allocated pages, ...?
			fssh_panic("file_cache: remove allocated pages! write pages failed: %s\n",
				fssh_strerror(status));
		}
	}

	mutex_lock(&ref->lock);

	// unmap the pages again

	// make the pages accessible in the cache
	for (int32_t i = pageIndex; i-- > 0;) {
		if (writeThrough)
			vm_page_set_state(pages[i], PAGE_STATE_ACTIVE);
		else
			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
		vm_page_put_page(pages[i]);
	}

	return status;
}


/**	Like read_into_cache() but writes data into the cache. To preserve data consistency,
 *	it might also read pages into the cache, though, if only a partial page gets written.
 *	The cache_ref lock must be held when calling this function.
 */

static fssh_status_t
write_to_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t size, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	TRACE(("write_to_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
		ref, offset, size, (void *)buffer, bufferSize));

	// make sure "offset" is page aligned - but also remember the page offset
	int32_t pageOffset = offset & (FSSH_B_PAGE_SIZE - 1);
	size = PAGE_ALIGN(size + pageOffset);
	offset -= pageOffset;

	while (true) {
		fssh_size_t chunkSize = size;
		if (chunkSize > (MAX_IO_VECS * FSSH_B_PAGE_SIZE))
			chunkSize = MAX_IO_VECS * FSSH_B_PAGE_SIZE;

		fssh_status_t status = write_chunk_to_cache(ref, offset, chunkSize, pageOffset, buffer, bufferSize);
		if (status != FSSH_B_OK)
			return status;

		if ((size -= chunkSize) == 0)
			return FSSH_B_OK;

		if (chunkSize >= bufferSize) {
			bufferSize = 0;
			buffer = 0;
		} else {
			bufferSize -= chunkSize - pageOffset;
			buffer += chunkSize - pageOffset;
		}

		offset += chunkSize;
		pageOffset = 0;
	}

	return FSSH_B_OK;
}


static fssh_status_t
satisfy_cache_io(file_cache_ref *ref, fssh_off_t offset, fssh_addr_t buffer, fssh_addr_t lastBuffer,
	bool doWrite)
{
	fssh_size_t requestSize = buffer - lastBuffer;

	if (doWrite)
		return write_to_cache(ref, offset, requestSize, lastBuffer, requestSize);

	return read_into_cache(ref, offset, requestSize, lastBuffer, requestSize);
}


static fssh_status_t
cache_io(void *_cacheRef, fssh_off_t offset, fssh_addr_t buffer, fssh_size_t *_size, bool doWrite)
{
	if (_cacheRef == NULL)
		fssh_panic("cache_io() called with NULL ref!\n");

	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
	fssh_off_t fileSize = ref->virtual_size;

	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));

	// out of bounds access?
	if (offset >= fileSize || offset < 0) {
		*_size = 0;
		return FSSH_B_OK;
	}

	int32_t pageOffset = offset & (FSSH_B_PAGE_SIZE - 1);
	fssh_size_t size = *_size;
	offset -= pageOffset;

	if (offset + pageOffset + size > fileSize) {
		// adapt size to be within the file's offsets
		size = fileSize - pageOffset - offset;
		*_size = size;
	}

	// "offset" and "lastOffset" are always aligned to FSSH_B_PAGE_SIZE,
	// the "last*" variables always point to the end of the last
	// satisfied request part

	fssh_size_t bytesLeft = size, lastLeft = size;
	int32_t lastPageOffset = pageOffset;
	fssh_addr_t lastBuffer = buffer;
	fssh_off_t lastOffset = offset;

	mutex_lock(&ref->lock);

	for (; bytesLeft > 0; offset += FSSH_B_PAGE_SIZE) {
		// check if this page is already in memory
	restart:
		vm_page *page = vm_cache_lookup_page(ref, offset);
		PagePutter pagePutter(page);

		if (page != NULL) {
			// The page may be busy - since we may need to unlock the cache
			// sometime in the near future, we need to satisfy the request
			// for the pages we didn't get yet (to make sure no one else
			// interferes in the meantime).
			fssh_status_t status = FSSH_B_OK;

			if (lastBuffer != buffer) {
				status = satisfy_cache_io(ref, lastOffset + lastPageOffset,
					buffer, lastBuffer, doWrite);
				if (status == FSSH_B_OK) {
					lastBuffer = buffer;
					lastLeft = bytesLeft;
					lastOffset = offset;
					lastPageOffset = 0;
					pageOffset = 0;
				}
			}

			if (status != FSSH_B_OK) {
				mutex_unlock(&ref->lock);
				return status;
			}

			if (page->state == PAGE_STATE_BUSY) {
				mutex_unlock(&ref->lock);
				// ToDo: don't wait forever!
				fssh_snooze(20000);
				mutex_lock(&ref->lock);
				goto restart;
			}
		}

		fssh_size_t bytesInPage = fssh_min_c(fssh_size_t(FSSH_B_PAGE_SIZE - pageOffset), bytesLeft);
		fssh_addr_t virtualAddress;

		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset = %lu\n", offset, page, bytesLeft, pageOffset));
		if (page != NULL) {
			virtualAddress = page->Address();

			// and copy the contents of the page already in memory
			if (doWrite) {
				user_memcpy((void *)(virtualAddress + pageOffset), (void *)buffer, bytesInPage);

				// make sure the page is in the modified list
				if (page->state != PAGE_STATE_MODIFIED)
					vm_page_set_state(page, PAGE_STATE_MODIFIED);
			} else
				user_memcpy((void *)buffer, (void *)(virtualAddress + pageOffset), bytesInPage);

			if (bytesLeft <= bytesInPage) {
				// we've handled the last page, so we're done!
				mutex_unlock(&ref->lock);
				return FSSH_B_OK;
			}

			// prepare a potential gap request
			lastBuffer = buffer + bytesInPage;
			lastLeft = bytesLeft - bytesInPage;
			lastOffset = offset + FSSH_B_PAGE_SIZE;
			lastPageOffset = 0;
		}

		if (bytesLeft <= bytesInPage)
			break;

		buffer += bytesInPage;
		bytesLeft -= bytesInPage;
		pageOffset = 0;
	}

	// fill the last remaining bytes of the request (either write or read)

	fssh_status_t status;
	if (doWrite)
		status = write_to_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
	else
		status = read_into_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);

	mutex_unlock(&ref->lock);
	return status;
}


}	// namespace FSShell


//	#pragma mark - public FS API


using namespace FSShell;


void *
fssh_file_cache_create(fssh_mount_id mountID, fssh_vnode_id vnodeID,
	fssh_off_t size, int fd)
{
	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, fd = %d)\n", mountID, vnodeID, size, fd));

	file_cache_ref *ref = new(nothrow) file_cache_ref;
	if (ref == NULL)
		return NULL;

	ref->mountID = mountID;
	ref->nodeID = vnodeID;
	ref->deviceFD = fd;
	ref->virtual_size = size;

	// get vnode
	fssh_status_t error = vfs_get_vnode(mountID, vnodeID, &ref->node);
	if (error != FSSH_B_OK) {
		fssh_dprintf("file_cache_create(): Failed to get vnode %d:%lld: %s\n",
			mountID, vnodeID, fssh_strerror(error));
		delete ref;
		return NULL;
	}

	// create lock
	char buffer[32];
	fssh_snprintf(buffer, sizeof(buffer), "file cache %d:%lld", (int)mountID, vnodeID);
	error = mutex_init(&ref->lock, buffer);
	if (error != FSSH_B_OK) {
		fssh_dprintf("file_cache_create(): Failed to init mutex: %s\n",
			fssh_strerror(error));
		vfs_put_vnode(ref->node);
		delete ref;
		return NULL;
	}

	return ref;
}


void
fssh_file_cache_delete(void *_cacheRef)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	if (ref == NULL)
		return;

	TRACE(("file_cache_delete(ref = %p)\n", ref));

	// write all modified pages and resize the cache to 0 to free all pages
	// it contains
	vm_cache_write_modified(ref, false);

	mutex_lock(&ref->lock);
	vm_cache_resize(ref, 0);

	mutex_destroy(&ref->lock);

	vfs_put_vnode(ref->node);

	delete ref;
}


fssh_status_t
fssh_file_cache_set_size(void *_cacheRef, fssh_off_t size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, size));

	if (ref == NULL)
		return FSSH_B_OK;

	fssh_file_cache_invalidate_file_map(_cacheRef, 0, size);
		// ToDo: make this better (we would only need to extend or shrink the map)

	mutex_lock(&ref->lock);
	fssh_status_t status = vm_cache_resize(ref, size);
	mutex_unlock(&ref->lock);

	return status;
}


fssh_status_t
fssh_file_cache_sync(void *_cacheRef)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
	if (ref == NULL)
		return FSSH_B_BAD_VALUE;

	return vm_cache_write_modified(ref, true);
}


fssh_status_t
fssh_file_cache_read_pages(void *_cacheRef, fssh_off_t offset, const fssh_iovec *vecs, fssh_size_t count, fssh_size_t *_numBytes)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	return pages_io(ref, offset, vecs, count, _numBytes, false);
}


fssh_status_t
fssh_file_cache_write_pages(void *_cacheRef, fssh_off_t offset, const fssh_iovec *vecs, fssh_size_t count, fssh_size_t *_numBytes)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	fssh_status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, count = %lu, bytes = %lu) = %ld\n",
		ref, offset, vecs, count, *_numBytes, status));

	return status;
}


fssh_status_t
fssh_file_cache_read(void *_cacheRef, fssh_off_t offset, void *bufferBase, fssh_size_t *_size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
		ref, offset, bufferBase, *_size));

	return cache_io(ref, offset, (fssh_addr_t)bufferBase, _size, false);
}


fssh_status_t
fssh_file_cache_write(void *_cacheRef, fssh_off_t offset, const void *buffer, fssh_size_t *_size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	fssh_status_t status = cache_io(ref, offset, (fssh_addr_t)const_cast<void *>(buffer), _size, true);
	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu) = %ld\n",
		ref, offset, buffer, *_size, status));

	return status;
}


fssh_status_t
fssh_file_cache_invalidate_file_map(void *_cacheRef, fssh_off_t offset, fssh_off_t size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	// ToDo: honour offset/size parameters

	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset, size));
	mutex_lock(&ref->lock);
	ref->map.Free();
	mutex_unlock(&ref->lock);
	return FSSH_B_OK;
}