1/*
2 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3 * Copyright 2002-2018, Axel D��rfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11/*! Virtual File System and File System Interface Layer */
12
13
14#include <ctype.h>
15#include <fcntl.h>
16#include <limits.h>
17#include <stddef.h>
18#include <stdio.h>
19#include <string.h>
20#include <sys/file.h>
21#include <sys/resource.h>
22#include <sys/stat.h>
23#include <unistd.h>
24
25#include <fs_attr.h>
26#include <fs_info.h>
27#include <fs_interface.h>
28#include <fs_volume.h>
29#include <NodeMonitor.h>
30#include <OS.h>
31#include <StorageDefs.h>
32
33#include <AutoDeleter.h>
34#include <block_cache.h>
35#include <boot/kernel_args.h>
36#include <debug_heap.h>
37#include <disk_device_manager/KDiskDevice.h>
38#include <disk_device_manager/KDiskDeviceManager.h>
39#include <disk_device_manager/KDiskDeviceUtils.h>
40#include <disk_device_manager/KDiskSystem.h>
41#include <fd.h>
42#include <file_cache.h>
43#include <fs/node_monitor.h>
44#include <KPath.h>
45#include <lock.h>
46#include <low_resource_manager.h>
47#include <slab/Slab.h>
48#include <syscalls.h>
49#include <syscall_restart.h>
50#include <tracing.h>
51#include <util/atomic.h>
52#include <util/AutoLock.h>
53#include <util/DoublyLinkedList.h>
54#include <vfs.h>
55#include <vm/vm.h>
56#include <vm/VMCache.h>
57#include <wait_for_objects.h>
58
59#include "EntryCache.h"
60#include "fifo.h"
61#include "IORequest.h"
62#include "unused_vnodes.h"
63#include "vfs_tracing.h"
64#include "Vnode.h"
65#include "../cache/vnode_store.h"
66
67
68//#define TRACE_VFS
69#ifdef TRACE_VFS
70#	define TRACE(x) dprintf x
71#	define FUNCTION(x) dprintf x
72#else
73#	define TRACE(x) ;
74#	define FUNCTION(x) ;
75#endif
76
77#define ADD_DEBUGGER_COMMANDS
78
79
80#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
81#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
82
83#if KDEBUG
84#	define FS_CALL(vnode, op, params...) \
85		( HAS_FS_CALL(vnode, op) ? \
86			vnode->ops->op(vnode->mount->volume, vnode, params) \
87			: (panic("FS_CALL op " #op " is NULL"), 0))
88#	define FS_CALL_NO_PARAMS(vnode, op) \
89		( HAS_FS_CALL(vnode, op) ? \
90			vnode->ops->op(vnode->mount->volume, vnode) \
91			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
92#	define FS_MOUNT_CALL(mount, op, params...) \
93		( HAS_FS_MOUNT_CALL(mount, op) ? \
94			mount->volume->ops->op(mount->volume, params) \
95			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
96#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
97		( HAS_FS_MOUNT_CALL(mount, op) ? \
98			mount->volume->ops->op(mount->volume) \
99			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
100#else
101#	define FS_CALL(vnode, op, params...) \
102			vnode->ops->op(vnode->mount->volume, vnode, params)
103#	define FS_CALL_NO_PARAMS(vnode, op) \
104			vnode->ops->op(vnode->mount->volume, vnode)
105#	define FS_MOUNT_CALL(mount, op, params...) \
106			mount->volume->ops->op(mount->volume, params)
107#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
108			mount->volume->ops->op(mount->volume)
109#endif
110
111
112const static size_t kMaxPathLength = 65536;
113	// The absolute maximum path length (for getcwd() - this is not depending
114	// on PATH_MAX
115
116
117typedef DoublyLinkedList<vnode> VnodeList;
118
119/*!	\brief Structure to manage a mounted file system
120
121	Note: The root_vnode and root_vnode->covers fields (what others?) are
122	initialized in fs_mount() and not changed afterwards. That is as soon
123	as the mount is mounted and it is made sure it won't be unmounted
124	(e.g. by holding a reference to a vnode of that mount) (read) access
125	to those fields is always safe, even without additional locking. Morever
126	while mounted the mount holds a reference to the root_vnode->covers vnode,
127	and thus making the access path vnode->mount->root_vnode->covers->mount->...
128	safe if a reference to vnode is held (note that for the root mount
129	root_vnode->covers is NULL, though).
130*/
131struct fs_mount {
132	fs_mount()
133		:
134		volume(NULL),
135		device_name(NULL)
136	{
137		mutex_init(&lock, "mount lock");
138	}
139
140	~fs_mount()
141	{
142		mutex_destroy(&lock);
143		free(device_name);
144
145		while (volume) {
146			fs_volume* superVolume = volume->super_volume;
147
148			if (volume->file_system != NULL)
149				put_module(volume->file_system->info.name);
150
151			free(volume->file_system_name);
152			free(volume);
153			volume = superVolume;
154		}
155	}
156
157	struct fs_mount* next;
158	dev_t			id;
159	fs_volume*		volume;
160	char*			device_name;
161	mutex			lock;	// guards the vnodes list
162	struct vnode*	root_vnode;
163	struct vnode*	covers_vnode;	// immutable
164	KPartition*		partition;
165	VnodeList		vnodes;
166	EntryCache		entry_cache;
167	bool			unmounting;
168	bool			owns_file_device;
169};
170
171
172namespace {
173
174struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
175	list_link		link;
176	void*			bound_to;
177	team_id			team;
178	pid_t			session;
179	off_t			start;
180	off_t			end;
181	bool			shared;
182};
183
184typedef DoublyLinkedList<advisory_lock> LockList;
185
186} // namespace
187
188
189struct advisory_locking {
190	sem_id			lock;
191	sem_id			wait_sem;
192	LockList		locks;
193
194	advisory_locking()
195		:
196		lock(-1),
197		wait_sem(-1)
198	{
199	}
200
201	~advisory_locking()
202	{
203		if (lock >= 0)
204			delete_sem(lock);
205		if (wait_sem >= 0)
206			delete_sem(wait_sem);
207	}
208};
209
210/*!	\brief Guards sMountsTable.
211
212	The holder is allowed to read/write access the sMountsTable.
213	Manipulation of the fs_mount structures themselves
214	(and their destruction) requires different locks though.
215*/
216static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");
217
218/*!	\brief Guards mount/unmount operations.
219
220	The fs_mount() and fs_unmount() hold the lock during their whole operation.
221	That is locking the lock ensures that no FS is mounted/unmounted. In
222	particular this means that
223	- sMountsTable will not be modified,
224	- the fields immutable after initialization of the fs_mount structures in
225	  sMountsTable will not be modified,
226
227	The thread trying to lock the lock must not hold sVnodeLock or
228	sMountMutex.
229*/
230static recursive_lock sMountOpLock;
231
232/*!	\brief Guards sVnodeTable.
233
234	The holder is allowed read/write access to sVnodeTable and to
235	any unbusy vnode in that table, save to the immutable fields (device, id,
236	private_node, mount) to which only read-only access is allowed.
237	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
238	well as the busy, removed, unused flags, and the vnode's type can also be
239	write accessed when holding a read lock to sVnodeLock *and* having the vnode
240	locked. Write access to covered_by and covers requires to write lock
241	sVnodeLock.
242
243	The thread trying to acquire the lock must not hold sMountMutex.
244	You must not hold this lock when calling create_sem(), as this might call
245	vfs_free_unused_vnodes() and thus cause a deadlock.
246*/
247static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
248
249/*!	\brief Guards io_context::root.
250
251	Must be held when setting or getting the io_context::root field.
252	The only operation allowed while holding this lock besides getting or
253	setting the field is inc_vnode_ref_count() on io_context::root.
254*/
255static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
256
257
258namespace {
259
260struct vnode_hash_key {
261	dev_t	device;
262	ino_t	vnode;
263};
264
265struct VnodeHash {
266	typedef vnode_hash_key	KeyType;
267	typedef	struct vnode	ValueType;
268
269#define VHASH(mountid, vnodeid) \
270	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
271
272	size_t HashKey(KeyType key) const
273	{
274		return VHASH(key.device, key.vnode);
275	}
276
277	size_t Hash(ValueType* vnode) const
278	{
279		return VHASH(vnode->device, vnode->id);
280	}
281
282#undef VHASH
283
284	bool Compare(KeyType key, ValueType* vnode) const
285	{
286		return vnode->device == key.device && vnode->id == key.vnode;
287	}
288
289	ValueType*& GetLink(ValueType* value) const
290	{
291		return value->next;
292	}
293};
294
295typedef BOpenHashTable<VnodeHash> VnodeTable;
296
297
298struct MountHash {
299	typedef dev_t			KeyType;
300	typedef	struct fs_mount	ValueType;
301
302	size_t HashKey(KeyType key) const
303	{
304		return key;
305	}
306
307	size_t Hash(ValueType* mount) const
308	{
309		return mount->id;
310	}
311
312	bool Compare(KeyType key, ValueType* mount) const
313	{
314		return mount->id == key;
315	}
316
317	ValueType*& GetLink(ValueType* value) const
318	{
319		return value->next;
320	}
321};
322
323typedef BOpenHashTable<MountHash> MountTable;
324
325} // namespace
326
327
328object_cache* sPathNameCache;
329object_cache* sFileDescriptorCache;
330
331#define VNODE_HASH_TABLE_SIZE 1024
332static VnodeTable* sVnodeTable;
333static struct vnode* sRoot;
334
335#define MOUNTS_HASH_TABLE_SIZE 16
336static MountTable* sMountsTable;
337static dev_t sNextMountID = 1;
338
339#define MAX_TEMP_IO_VECS 8
340
341// How long to wait for busy vnodes (10s)
342#define BUSY_VNODE_RETRIES 2000
343#define BUSY_VNODE_DELAY 5000
344
345mode_t __gUmask = 022;
346
347/* function declarations */
348
349static void free_unused_vnodes();
350
351// file descriptor operation prototypes
352static status_t file_read(struct file_descriptor* descriptor, off_t pos,
353	void* buffer, size_t* _bytes);
354static status_t file_write(struct file_descriptor* descriptor, off_t pos,
355	const void* buffer, size_t* _bytes);
356static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
357	int seekType);
358static void file_free_fd(struct file_descriptor* descriptor);
359static status_t file_close(struct file_descriptor* descriptor);
360static status_t file_select(struct file_descriptor* descriptor, uint8 event,
361	struct selectsync* sync);
362static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
363	struct selectsync* sync);
364static status_t dir_read(struct io_context* context,
365	struct file_descriptor* descriptor, struct dirent* buffer,
366	size_t bufferSize, uint32* _count);
367static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
368	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
369static status_t dir_rewind(struct file_descriptor* descriptor);
370static void dir_free_fd(struct file_descriptor* descriptor);
371static status_t dir_close(struct file_descriptor* descriptor);
372static status_t attr_dir_read(struct io_context* context,
373	struct file_descriptor* descriptor, struct dirent* buffer,
374	size_t bufferSize, uint32* _count);
375static status_t attr_dir_rewind(struct file_descriptor* descriptor);
376static void attr_dir_free_fd(struct file_descriptor* descriptor);
377static status_t attr_dir_close(struct file_descriptor* descriptor);
378static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
379	void* buffer, size_t* _bytes);
380static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
381	const void* buffer, size_t* _bytes);
382static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
383	int seekType);
384static void attr_free_fd(struct file_descriptor* descriptor);
385static status_t attr_close(struct file_descriptor* descriptor);
386static status_t attr_read_stat(struct file_descriptor* descriptor,
387	struct stat* statData);
388static status_t attr_write_stat(struct file_descriptor* descriptor,
389	const struct stat* stat, int statMask);
390static status_t index_dir_read(struct io_context* context,
391	struct file_descriptor* descriptor, struct dirent* buffer,
392	size_t bufferSize, uint32* _count);
393static status_t index_dir_rewind(struct file_descriptor* descriptor);
394static void index_dir_free_fd(struct file_descriptor* descriptor);
395static status_t index_dir_close(struct file_descriptor* descriptor);
396static status_t query_read(struct io_context* context,
397	struct file_descriptor* descriptor, struct dirent* buffer,
398	size_t bufferSize, uint32* _count);
399static status_t query_rewind(struct file_descriptor* descriptor);
400static void query_free_fd(struct file_descriptor* descriptor);
401static status_t query_close(struct file_descriptor* descriptor);
402
403static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
404	void* buffer, size_t length);
405static status_t common_read_stat(struct file_descriptor* descriptor,
406	struct stat* statData);
407static status_t common_write_stat(struct file_descriptor* descriptor,
408	const struct stat* statData, int statMask);
409static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
410	struct stat* stat, bool kernel);
411
412static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
413	bool traverseLeafLink, int count, bool kernel,
414	struct vnode** _vnode, ino_t* _parentID);
415static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
416	size_t bufferSize, bool kernel);
417static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
418	struct vnode** _vnode, ino_t* _parentID, bool kernel);
419static void inc_vnode_ref_count(struct vnode* vnode);
420static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
421	bool reenter);
422static inline void put_vnode(struct vnode* vnode);
423static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
424	bool kernel);
425static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
426
427
428static struct fd_ops sFileOps = {
429	file_read,
430	file_write,
431	file_seek,
432	common_ioctl,
433	NULL,		// set_flags
434	file_select,
435	file_deselect,
436	NULL,		// read_dir()
437	NULL,		// rewind_dir()
438	common_read_stat,
439	common_write_stat,
440	file_close,
441	file_free_fd
442};
443
444static struct fd_ops sDirectoryOps = {
445	NULL,		// read()
446	NULL,		// write()
447	NULL,		// seek()
448	common_ioctl,
449	NULL,		// set_flags
450	NULL,		// select()
451	NULL,		// deselect()
452	dir_read,
453	dir_rewind,
454	common_read_stat,
455	common_write_stat,
456	dir_close,
457	dir_free_fd
458};
459
460static struct fd_ops sAttributeDirectoryOps = {
461	NULL,		// read()
462	NULL,		// write()
463	NULL,		// seek()
464	common_ioctl,
465	NULL,		// set_flags
466	NULL,		// select()
467	NULL,		// deselect()
468	attr_dir_read,
469	attr_dir_rewind,
470	common_read_stat,
471	common_write_stat,
472	attr_dir_close,
473	attr_dir_free_fd
474};
475
476static struct fd_ops sAttributeOps = {
477	attr_read,
478	attr_write,
479	attr_seek,
480	common_ioctl,
481	NULL,		// set_flags
482	NULL,		// select()
483	NULL,		// deselect()
484	NULL,		// read_dir()
485	NULL,		// rewind_dir()
486	attr_read_stat,
487	attr_write_stat,
488	attr_close,
489	attr_free_fd
490};
491
492static struct fd_ops sIndexDirectoryOps = {
493	NULL,		// read()
494	NULL,		// write()
495	NULL,		// seek()
496	NULL,		// ioctl()
497	NULL,		// set_flags
498	NULL,		// select()
499	NULL,		// deselect()
500	index_dir_read,
501	index_dir_rewind,
502	NULL,		// read_stat()
503	NULL,		// write_stat()
504	index_dir_close,
505	index_dir_free_fd
506};
507
508#if 0
509static struct fd_ops sIndexOps = {
510	NULL,		// read()
511	NULL,		// write()
512	NULL,		// seek()
513	NULL,		// ioctl()
514	NULL,		// set_flags
515	NULL,		// select()
516	NULL,		// deselect()
517	NULL,		// dir_read()
518	NULL,		// dir_rewind()
519	index_read_stat,	// read_stat()
520	NULL,		// write_stat()
521	NULL,		// dir_close()
522	NULL		// free_fd()
523};
524#endif
525
526static struct fd_ops sQueryOps = {
527	NULL,		// read()
528	NULL,		// write()
529	NULL,		// seek()
530	NULL,		// ioctl()
531	NULL,		// set_flags
532	NULL,		// select()
533	NULL,		// deselect()
534	query_read,
535	query_rewind,
536	NULL,		// read_stat()
537	NULL,		// write_stat()
538	query_close,
539	query_free_fd
540};
541
542
543namespace {
544
545class VNodePutter {
546public:
547	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}
548
549	~VNodePutter()
550	{
551		Put();
552	}
553
554	void SetTo(struct vnode* vnode)
555	{
556		Put();
557		fVNode = vnode;
558	}
559
560	void Put()
561	{
562		if (fVNode) {
563			put_vnode(fVNode);
564			fVNode = NULL;
565		}
566	}
567
568	struct vnode* Detach()
569	{
570		struct vnode* vnode = fVNode;
571		fVNode = NULL;
572		return vnode;
573	}
574
575private:
576	struct vnode* fVNode;
577};
578
579
580class FDCloser {
581public:
582	FDCloser() : fFD(-1), fKernel(true) {}
583
584	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
585
586	~FDCloser()
587	{
588		Close();
589	}
590
591	void SetTo(int fd, bool kernel)
592	{
593		Close();
594		fFD = fd;
595		fKernel = kernel;
596	}
597
598	void Close()
599	{
600		if (fFD >= 0) {
601			if (fKernel)
602				_kern_close(fFD);
603			else
604				_user_close(fFD);
605			fFD = -1;
606		}
607	}
608
609	int Detach()
610	{
611		int fd = fFD;
612		fFD = -1;
613		return fd;
614	}
615
616private:
617	int		fFD;
618	bool	fKernel;
619};
620
621} // namespace
622
623
624#if VFS_PAGES_IO_TRACING
625
626namespace VFSPagesIOTracing {
627
628class PagesIOTraceEntry : public AbstractTraceEntry {
629protected:
630	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
631		const generic_io_vec* vecs, uint32 count, uint32 flags,
632		generic_size_t bytesRequested, status_t status,
633		generic_size_t bytesTransferred)
634		:
635		fVnode(vnode),
636		fMountID(vnode->mount->id),
637		fNodeID(vnode->id),
638		fCookie(cookie),
639		fPos(pos),
640		fCount(count),
641		fFlags(flags),
642		fBytesRequested(bytesRequested),
643		fStatus(status),
644		fBytesTransferred(bytesTransferred)
645	{
646		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
647			sizeof(generic_io_vec) * count, false);
648	}
649
650	void AddDump(TraceOutput& out, const char* mode)
651	{
652		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
653			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
654			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
655			(uint64)fBytesRequested);
656
657		if (fVecs != NULL) {
658			for (uint32 i = 0; i < fCount; i++) {
659				if (i > 0)
660					out.Print(", ");
661				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
662					(uint64)fVecs[i].length);
663			}
664		}
665
666		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
667			"transferred: %" B_PRIu64, fFlags, fStatus,
668			(uint64)fBytesTransferred);
669	}
670
671protected:
672	struct vnode*	fVnode;
673	dev_t			fMountID;
674	ino_t			fNodeID;
675	void*			fCookie;
676	off_t			fPos;
677	generic_io_vec*	fVecs;
678	uint32			fCount;
679	uint32			fFlags;
680	generic_size_t	fBytesRequested;
681	status_t		fStatus;
682	generic_size_t	fBytesTransferred;
683};
684
685
686class ReadPages : public PagesIOTraceEntry {
687public:
688	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
689		const generic_io_vec* vecs, uint32 count, uint32 flags,
690		generic_size_t bytesRequested, status_t status,
691		generic_size_t bytesTransferred)
692		:
693		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
694			bytesRequested, status, bytesTransferred)
695	{
696		Initialized();
697	}
698
699	virtual void AddDump(TraceOutput& out)
700	{
701		PagesIOTraceEntry::AddDump(out, "read");
702	}
703};
704
705
706class WritePages : public PagesIOTraceEntry {
707public:
708	WritePages(struct vnode* vnode, void* cookie, off_t pos,
709		const generic_io_vec* vecs, uint32 count, uint32 flags,
710		generic_size_t bytesRequested, status_t status,
711		generic_size_t bytesTransferred)
712		:
713		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
714			bytesRequested, status, bytesTransferred)
715	{
716		Initialized();
717	}
718
719	virtual void AddDump(TraceOutput& out)
720	{
721		PagesIOTraceEntry::AddDump(out, "write");
722	}
723};
724
725}	// namespace VFSPagesIOTracing
726
727#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
728#else
729#	define TPIO(x) ;
730#endif	// VFS_PAGES_IO_TRACING
731
732
733/*! Finds the mounted device (the fs_mount structure) with the given ID.
734	Note, you must hold the gMountMutex lock when you call this function.
735*/
736static struct fs_mount*
737find_mount(dev_t id)
738{
739	ASSERT_LOCKED_MUTEX(&sMountMutex);
740
741	return sMountsTable->Lookup(id);
742}
743
744
745static status_t
746get_mount(dev_t id, struct fs_mount** _mount)
747{
748	struct fs_mount* mount;
749
750	ReadLocker nodeLocker(sVnodeLock);
751	MutexLocker mountLocker(sMountMutex);
752
753	mount = find_mount(id);
754	if (mount == NULL)
755		return B_BAD_VALUE;
756
757	struct vnode* rootNode = mount->root_vnode;
758	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
759		|| rootNode->ref_count == 0) {
760		// might have been called during a mount/unmount operation
761		return B_BUSY;
762	}
763
764	inc_vnode_ref_count(rootNode);
765	*_mount = mount;
766	return B_OK;
767}
768
769
770static void
771put_mount(struct fs_mount* mount)
772{
773	if (mount)
774		put_vnode(mount->root_vnode);
775}
776
777
778/*!	Tries to open the specified file system module.
779	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
780	Returns a pointer to file system module interface, or NULL if it
781	could not open the module.
782*/
783static file_system_module_info*
784get_file_system(const char* fsName)
785{
786	char name[B_FILE_NAME_LENGTH];
787	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
788		// construct module name if we didn't get one
789		// (we currently support only one API)
790		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
791		fsName = NULL;
792	}
793
794	file_system_module_info* info;
795	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
796		return NULL;
797
798	return info;
799}
800
801
802/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
803	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
804	The name is allocated for you, and you have to free() it when you're
805	done with it.
806	Returns NULL if the required memory is not available.
807*/
808static char*
809get_file_system_name(const char* fsName)
810{
811	const size_t length = strlen("file_systems/");
812
813	if (strncmp(fsName, "file_systems/", length)) {
814		// the name already seems to be the module's file name
815		return strdup(fsName);
816	}
817
818	fsName += length;
819	const char* end = strchr(fsName, '/');
820	if (end == NULL) {
821		// this doesn't seem to be a valid name, but well...
822		return strdup(fsName);
823	}
824
825	// cut off the trailing /v1
826
827	char* name = (char*)malloc(end + 1 - fsName);
828	if (name == NULL)
829		return NULL;
830
831	strlcpy(name, fsName, end + 1 - fsName);
832	return name;
833}
834
835
836/*!	Accepts a list of file system names separated by a colon, one for each
837	layer and returns the file system name for the specified layer.
838	The name is allocated for you, and you have to free() it when you're
839	done with it.
840	Returns NULL if the required memory is not available or if there is no
841	name for the specified layer.
842*/
843static char*
844get_file_system_name_for_layer(const char* fsNames, int32 layer)
845{
846	while (layer >= 0) {
847		const char* end = strchr(fsNames, ':');
848		if (end == NULL) {
849			if (layer == 0)
850				return strdup(fsNames);
851			return NULL;
852		}
853
854		if (layer == 0) {
855			size_t length = end - fsNames + 1;
856			char* result = (char*)malloc(length);
857			strlcpy(result, fsNames, length);
858			return result;
859		}
860
861		fsNames = end + 1;
862		layer--;
863	}
864
865	return NULL;
866}
867
868
869static void
870add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
871{
872	MutexLocker _(mount->lock);
873	mount->vnodes.Add(vnode);
874}
875
876
877static void
878remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
879{
880	MutexLocker _(mount->lock);
881	mount->vnodes.Remove(vnode);
882}
883
884
885/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
886
887	The caller must hold the sVnodeLock (read lock at least).
888
889	\param mountID the mount ID.
890	\param vnodeID the node ID.
891
892	\return The vnode structure, if it was found in the hash table, \c NULL
893			otherwise.
894*/
895static struct vnode*
896lookup_vnode(dev_t mountID, ino_t vnodeID)
897{
898	struct vnode_hash_key key;
899
900	key.device = mountID;
901	key.vnode = vnodeID;
902
903	return sVnodeTable->Lookup(key);
904}
905
906
907/*!	\brief Checks whether or not a busy vnode should be waited for (again).
908
909	This will also wait for BUSY_VNODE_DELAY before returning if one should
910	still wait for the vnode becoming unbusy.
911
912	\return \c true if one should retry, \c false if not.
913*/
914static bool
915retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
916{
917	if (--tries < 0) {
918		// vnode doesn't seem to become unbusy
919		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
920			" is not becoming unbusy!\n", mountID, vnodeID);
921		return false;
922	}
923	snooze(BUSY_VNODE_DELAY);
924	return true;
925}
926
927
928/*!	Creates a new vnode with the given mount and node ID.
929	If the node already exists, it is returned instead and no new node is
930	created. In either case -- but not, if an error occurs -- the function write
931	locks \c sVnodeLock and keeps it locked for the caller when returning. On
932	error the lock is not held on return.
933
934	\param mountID The mount ID.
935	\param vnodeID The vnode ID.
936	\param _vnode Will be set to the new vnode on success.
937	\param _nodeCreated Will be set to \c true when the returned vnode has
938		been newly created, \c false when it already existed. Will not be
939		changed on error.
940	\return \c B_OK, when the vnode was successfully created and inserted or
941		a node with the given ID was found, \c B_NO_MEMORY or
942		\c B_ENTRY_NOT_FOUND on error.
943*/
944static status_t
945create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
946	bool& _nodeCreated)
947{
948	FUNCTION(("create_new_vnode_and_lock()\n"));
949
950	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
951	if (vnode == NULL)
952		return B_NO_MEMORY;
953
954	// initialize basic values
955	memset(vnode, 0, sizeof(struct vnode));
956	vnode->device = mountID;
957	vnode->id = vnodeID;
958	vnode->ref_count = 1;
959	vnode->SetBusy(true);
960
961	// look up the node -- it might have been added by someone else in the
962	// meantime
963	rw_lock_write_lock(&sVnodeLock);
964	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
965	if (existingVnode != NULL) {
966		free(vnode);
967		_vnode = existingVnode;
968		_nodeCreated = false;
969		return B_OK;
970	}
971
972	// get the mount structure
973	mutex_lock(&sMountMutex);
974	vnode->mount = find_mount(mountID);
975	if (!vnode->mount || vnode->mount->unmounting) {
976		mutex_unlock(&sMountMutex);
977		rw_lock_write_unlock(&sVnodeLock);
978		free(vnode);
979		return B_ENTRY_NOT_FOUND;
980	}
981
982	// add the vnode to the mount's node list and the hash table
983	sVnodeTable->Insert(vnode);
984	add_vnode_to_mount_list(vnode, vnode->mount);
985
986	mutex_unlock(&sMountMutex);
987
988	_vnode = vnode;
989	_nodeCreated = true;
990
991	// keep the vnode lock locked
992	return B_OK;
993}
994
995
996/*!	Frees the vnode and all resources it has acquired, and removes
997	it from the vnode hash as well as from its mount structure.
998	Will also make sure that any cache modifications are written back.
999*/
1000static void
1001free_vnode(struct vnode* vnode, bool reenter)
1002{
1003	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
1004		vnode);
1005	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);
1006
1007	// write back any changes in this vnode's cache -- but only
1008	// if the vnode won't be deleted, in which case the changes
1009	// will be discarded
1010
1011	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
1012		FS_CALL_NO_PARAMS(vnode, fsync);
1013
1014	// Note: If this vnode has a cache attached, there will still be two
1015	// references to that cache at this point. The last one belongs to the vnode
1016	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
1017	// cache. Each but the last reference to a cache also includes a reference
1018	// to the vnode. The file cache, however, released its reference (cf.
1019	// file_cache_create()), so that this vnode's ref count has the chance to
1020	// ever drop to 0. Deleting the file cache now, will cause the next to last
1021	// cache reference to be released, which will also release a (no longer
1022	// existing) vnode reference. To avoid problems, we set the vnode's ref
1023	// count, so that it will neither become negative nor 0.
1024	vnode->ref_count = 2;
1025
1026	if (!vnode->IsUnpublished()) {
1027		if (vnode->IsRemoved())
1028			FS_CALL(vnode, remove_vnode, reenter);
1029		else
1030			FS_CALL(vnode, put_vnode, reenter);
1031	}
1032
1033	// If the vnode has a VMCache attached, make sure that it won't try to get
1034	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1035	// long as the vnode is busy and in the hash, that won't happen, but as
1036	// soon as we've removed it from the hash, it could reload the vnode -- with
1037	// a new cache attached!
1038	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
1039		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1040
1041	// The file system has removed the resources of the vnode now, so we can
1042	// make it available again (by removing the busy vnode from the hash).
1043	rw_lock_write_lock(&sVnodeLock);
1044	sVnodeTable->Remove(vnode);
1045	rw_lock_write_unlock(&sVnodeLock);
1046
1047	// if we have a VMCache attached, remove it
1048	if (vnode->cache)
1049		vnode->cache->ReleaseRef();
1050
1051	vnode->cache = NULL;
1052
1053	remove_vnode_from_mount_list(vnode, vnode->mount);
1054
1055	free(vnode);
1056}
1057
1058
1059/*!	\brief Decrements the reference counter of the given vnode and deletes it,
1060	if the counter dropped to 0.
1061
1062	The caller must, of course, own a reference to the vnode to call this
1063	function.
1064	The caller must not hold the sVnodeLock or the sMountMutex.
1065
1066	\param vnode the vnode.
1067	\param alwaysFree don't move this vnode into the unused list, but really
1068		   delete it if possible.
1069	\param reenter \c true, if this function is called (indirectly) from within
1070		   a file system. This will be passed to file system hooks only.
1071	\return \c B_OK, if everything went fine, an error code otherwise.
1072*/
1073static status_t
1074dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1075{
1076	ReadLocker locker(sVnodeLock);
1077	AutoLocker<Vnode> nodeLocker(vnode);
1078
1079	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1080
1081	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1082
1083	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1084		vnode->ref_count));
1085
1086	if (oldRefCount != 1)
1087		return B_OK;
1088
1089	if (vnode->IsBusy())
1090		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1091
1092	bool freeNode = false;
1093	bool freeUnusedNodes = false;
1094
1095	// Just insert the vnode into an unused list if we don't need
1096	// to delete it
1097	if (vnode->IsRemoved() || alwaysFree) {
1098		vnode_to_be_freed(vnode);
1099		vnode->SetBusy(true);
1100		freeNode = true;
1101	} else
1102		freeUnusedNodes = vnode_unused(vnode);
1103
1104	nodeLocker.Unlock();
1105	locker.Unlock();
1106
1107	if (freeNode)
1108		free_vnode(vnode, reenter);
1109	else if (freeUnusedNodes)
1110		free_unused_vnodes();
1111
1112	return B_OK;
1113}
1114
1115
1116/*!	\brief Increments the reference counter of the given vnode.
1117
1118	The caller must make sure that the node isn't deleted while this function
1119	is called. This can be done either:
1120	- by ensuring that a reference to the node exists and remains in existence,
1121	  or
1122	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1123	  or by holding sVnodeLock write locked.
1124
1125	In the second case the caller is responsible for dealing with the ref count
1126	0 -> 1 transition. That is 1. this function must not be invoked when the
1127	node is busy in the first place and 2. vnode_used() must be called for the
1128	node.
1129
1130	\param vnode the vnode.
1131*/
1132static void
1133inc_vnode_ref_count(struct vnode* vnode)
1134{
1135	atomic_add(&vnode->ref_count, 1);
1136	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1137		vnode->ref_count));
1138}
1139
1140
1141static bool
1142is_special_node_type(int type)
1143{
1144	// at the moment only FIFOs are supported
1145	return S_ISFIFO(type);
1146}
1147
1148
1149static status_t
1150create_special_sub_node(struct vnode* vnode, uint32 flags)
1151{
1152	if (S_ISFIFO(vnode->Type()))
1153		return create_fifo_vnode(vnode->mount->volume, vnode);
1154
1155	return B_BAD_VALUE;
1156}
1157
1158
1159/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1160
1161	If the node is not yet in memory, it will be loaded.
1162
1163	The caller must not hold the sVnodeLock or the sMountMutex.
1164
1165	\param mountID the mount ID.
1166	\param vnodeID the node ID.
1167	\param _vnode Pointer to a vnode* variable into which the pointer to the
1168		   retrieved vnode structure shall be written.
1169	\param reenter \c true, if this function is called (indirectly) from within
1170		   a file system.
1171	\return \c B_OK, if everything when fine, an error code otherwise.
1172*/
1173static status_t
1174get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1175	int reenter)
1176{
1177	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1178		mountID, vnodeID, _vnode));
1179
1180	rw_lock_read_lock(&sVnodeLock);
1181
1182	int32 tries = BUSY_VNODE_RETRIES;
1183restart:
1184	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1185	AutoLocker<Vnode> nodeLocker(vnode);
1186
1187	if (vnode && vnode->IsBusy()) {
1188		nodeLocker.Unlock();
1189		rw_lock_read_unlock(&sVnodeLock);
1190		if (!canWait) {
1191			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1192				mountID, vnodeID);
1193			return B_BUSY;
1194		}
1195		if (!retry_busy_vnode(tries, mountID, vnodeID))
1196			return B_BUSY;
1197
1198		rw_lock_read_lock(&sVnodeLock);
1199		goto restart;
1200	}
1201
1202	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1203
1204	status_t status;
1205
1206	if (vnode) {
1207		if (vnode->ref_count == 0) {
1208			// this vnode has been unused before
1209			vnode_used(vnode);
1210		}
1211		inc_vnode_ref_count(vnode);
1212
1213		nodeLocker.Unlock();
1214		rw_lock_read_unlock(&sVnodeLock);
1215	} else {
1216		// we need to create a new vnode and read it in
1217		rw_lock_read_unlock(&sVnodeLock);
1218			// unlock -- create_new_vnode_and_lock() write-locks on success
1219		bool nodeCreated;
1220		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1221			nodeCreated);
1222		if (status != B_OK)
1223			return status;
1224
1225		if (!nodeCreated) {
1226			rw_lock_read_lock(&sVnodeLock);
1227			rw_lock_write_unlock(&sVnodeLock);
1228			goto restart;
1229		}
1230
1231		rw_lock_write_unlock(&sVnodeLock);
1232
1233		int type;
1234		uint32 flags;
1235		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1236			&flags, reenter);
1237		if (status == B_OK && vnode->private_node == NULL)
1238			status = B_BAD_VALUE;
1239
1240		bool gotNode = status == B_OK;
1241		bool publishSpecialSubNode = false;
1242		if (gotNode) {
1243			vnode->SetType(type);
1244			publishSpecialSubNode = is_special_node_type(type)
1245				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1246		}
1247
1248		if (gotNode && publishSpecialSubNode)
1249			status = create_special_sub_node(vnode, flags);
1250
1251		if (status != B_OK) {
1252			if (gotNode)
1253				FS_CALL(vnode, put_vnode, reenter);
1254
1255			rw_lock_write_lock(&sVnodeLock);
1256			sVnodeTable->Remove(vnode);
1257			remove_vnode_from_mount_list(vnode, vnode->mount);
1258			rw_lock_write_unlock(&sVnodeLock);
1259
1260			free(vnode);
1261			return status;
1262		}
1263
1264		rw_lock_read_lock(&sVnodeLock);
1265		vnode->Lock();
1266
1267		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1268		vnode->SetBusy(false);
1269
1270		vnode->Unlock();
1271		rw_lock_read_unlock(&sVnodeLock);
1272	}
1273
1274	TRACE(("get_vnode: returning %p\n", vnode));
1275
1276	*_vnode = vnode;
1277	return B_OK;
1278}
1279
1280
1281/*!	\brief Decrements the reference counter of the given vnode and deletes it,
1282	if the counter dropped to 0.
1283
1284	The caller must, of course, own a reference to the vnode to call this
1285	function.
1286	The caller must not hold the sVnodeLock or the sMountMutex.
1287
1288	\param vnode the vnode.
1289*/
1290static inline void
1291put_vnode(struct vnode* vnode)
1292{
1293	dec_vnode_ref_count(vnode, false, false);
1294}
1295
1296
1297static void
1298free_unused_vnodes(int32 level)
1299{
1300	unused_vnodes_check_started();
1301
1302	if (level == B_NO_LOW_RESOURCE) {
1303		unused_vnodes_check_done();
1304		return;
1305	}
1306
1307	flush_hot_vnodes();
1308
1309	// determine how many nodes to free
1310	uint32 count = 1;
1311	{
1312		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1313
1314		switch (level) {
1315			case B_LOW_RESOURCE_NOTE:
1316				count = sUnusedVnodes / 100;
1317				break;
1318			case B_LOW_RESOURCE_WARNING:
1319				count = sUnusedVnodes / 10;
1320				break;
1321			case B_LOW_RESOURCE_CRITICAL:
1322				count = sUnusedVnodes;
1323				break;
1324		}
1325
1326		if (count > sUnusedVnodes)
1327			count = sUnusedVnodes;
1328	}
1329
1330	// Write back the modified pages of some unused vnodes and free them.
1331
1332	for (uint32 i = 0; i < count; i++) {
1333		ReadLocker vnodesReadLocker(sVnodeLock);
1334
1335		// get the first node
1336		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1337		struct vnode* vnode = (struct vnode*)list_get_first_item(
1338			&sUnusedVnodeList);
1339		unusedVnodesLocker.Unlock();
1340
1341		if (vnode == NULL)
1342			break;
1343
1344		// lock the node
1345		AutoLocker<Vnode> nodeLocker(vnode);
1346
1347		// Check whether the node is still unused -- since we only append to the
1348		// tail of the unused queue, the vnode should still be at its head.
1349		// Alternatively we could check its ref count for 0 and its busy flag,
1350		// but if the node is no longer at the head of the queue, it means it
1351		// has been touched in the meantime, i.e. it is no longer the least
1352		// recently used unused vnode and we rather don't free it.
1353		unusedVnodesLocker.Lock();
1354		if (vnode != list_get_first_item(&sUnusedVnodeList))
1355			continue;
1356		unusedVnodesLocker.Unlock();
1357
1358		ASSERT(!vnode->IsBusy());
1359
1360		// grab a reference
1361		inc_vnode_ref_count(vnode);
1362		vnode_used(vnode);
1363
1364		// write back changes and free the node
1365		nodeLocker.Unlock();
1366		vnodesReadLocker.Unlock();
1367
1368		if (vnode->cache != NULL)
1369			vnode->cache->WriteModified();
1370
1371		dec_vnode_ref_count(vnode, true, false);
1372			// this should free the vnode when it's still unused
1373	}
1374
1375	unused_vnodes_check_done();
1376}
1377
1378
1379/*!	Gets the vnode the given vnode is covering.
1380
1381	The caller must have \c sVnodeLock read-locked at least.
1382
1383	The function returns a reference to the retrieved vnode (if any), the caller
1384	is responsible to free.
1385
1386	\param vnode The vnode whose covered node shall be returned.
1387	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1388		vnode.
1389*/
1390static inline Vnode*
1391get_covered_vnode_locked(Vnode* vnode)
1392{
1393	if (Vnode* coveredNode = vnode->covers) {
1394		while (coveredNode->covers != NULL)
1395			coveredNode = coveredNode->covers;
1396
1397		inc_vnode_ref_count(coveredNode);
1398		return coveredNode;
1399	}
1400
1401	return NULL;
1402}
1403
1404
1405/*!	Gets the vnode the given vnode is covering.
1406
1407	The caller must not hold \c sVnodeLock. Note that this implies a race
1408	condition, since the situation can change at any time.
1409
1410	The function returns a reference to the retrieved vnode (if any), the caller
1411	is responsible to free.
1412
1413	\param vnode The vnode whose covered node shall be returned.
1414	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1415		vnode.
1416*/
1417static inline Vnode*
1418get_covered_vnode(Vnode* vnode)
1419{
1420	if (!vnode->IsCovering())
1421		return NULL;
1422
1423	ReadLocker vnodeReadLocker(sVnodeLock);
1424	return get_covered_vnode_locked(vnode);
1425}
1426
1427
1428/*!	Gets the vnode the given vnode is covered by.
1429
1430	The caller must have \c sVnodeLock read-locked at least.
1431
1432	The function returns a reference to the retrieved vnode (if any), the caller
1433	is responsible to free.
1434
1435	\param vnode The vnode whose covering node shall be returned.
1436	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1437		any vnode.
1438*/
1439static Vnode*
1440get_covering_vnode_locked(Vnode* vnode)
1441{
1442	if (Vnode* coveringNode = vnode->covered_by) {
1443		while (coveringNode->covered_by != NULL)
1444			coveringNode = coveringNode->covered_by;
1445
1446		inc_vnode_ref_count(coveringNode);
1447		return coveringNode;
1448	}
1449
1450	return NULL;
1451}
1452
1453
1454/*!	Gets the vnode the given vnode is covered by.
1455
1456	The caller must not hold \c sVnodeLock. Note that this implies a race
1457	condition, since the situation can change at any time.
1458
1459	The function returns a reference to the retrieved vnode (if any), the caller
1460	is responsible to free.
1461
1462	\param vnode The vnode whose covering node shall be returned.
1463	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1464		any vnode.
1465*/
1466static inline Vnode*
1467get_covering_vnode(Vnode* vnode)
1468{
1469	if (!vnode->IsCovered())
1470		return NULL;
1471
1472	ReadLocker vnodeReadLocker(sVnodeLock);
1473	return get_covering_vnode_locked(vnode);
1474}
1475
1476
1477static void
1478free_unused_vnodes()
1479{
1480	free_unused_vnodes(
1481		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1482			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1483}
1484
1485
1486static void
1487vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1488{
1489	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1490
1491	free_unused_vnodes(level);
1492}
1493
1494
1495static inline void
1496put_advisory_locking(struct advisory_locking* locking)
1497{
1498	release_sem(locking->lock);
1499}
1500
1501
1502/*!	Returns the advisory_locking object of the \a vnode in case it
1503	has one, and locks it.
1504	You have to call put_advisory_locking() when you're done with
1505	it.
1506	Note, you must not have the vnode mutex locked when calling
1507	this function.
1508*/
1509static struct advisory_locking*
1510get_advisory_locking(struct vnode* vnode)
1511{
1512	rw_lock_read_lock(&sVnodeLock);
1513	vnode->Lock();
1514
1515	struct advisory_locking* locking = vnode->advisory_locking;
1516	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1517
1518	vnode->Unlock();
1519	rw_lock_read_unlock(&sVnodeLock);
1520
1521	if (lock >= 0)
1522		lock = acquire_sem(lock);
1523	if (lock < 0) {
1524		// This means the locking has been deleted in the mean time
1525		// or had never existed in the first place - otherwise, we
1526		// would get the lock at some point.
1527		return NULL;
1528	}
1529
1530	return locking;
1531}
1532
1533
1534/*!	Creates a locked advisory_locking object, and attaches it to the
1535	given \a vnode.
1536	Returns B_OK in case of success - also if the vnode got such an
1537	object from someone else in the mean time, you'll still get this
1538	one locked then.
1539*/
1540static status_t
1541create_advisory_locking(struct vnode* vnode)
1542{
1543	if (vnode == NULL)
1544		return B_FILE_ERROR;
1545
1546	ObjectDeleter<advisory_locking> lockingDeleter;
1547	struct advisory_locking* locking = NULL;
1548
1549	while (get_advisory_locking(vnode) == NULL) {
1550		// no locking object set on the vnode yet, create one
1551		if (locking == NULL) {
1552			locking = new(std::nothrow) advisory_locking;
1553			if (locking == NULL)
1554				return B_NO_MEMORY;
1555			lockingDeleter.SetTo(locking);
1556
1557			locking->wait_sem = create_sem(0, "advisory lock");
1558			if (locking->wait_sem < 0)
1559				return locking->wait_sem;
1560
1561			locking->lock = create_sem(0, "advisory locking");
1562			if (locking->lock < 0)
1563				return locking->lock;
1564		}
1565
1566		// set our newly created locking object
1567		ReadLocker _(sVnodeLock);
1568		AutoLocker<Vnode> nodeLocker(vnode);
1569		if (vnode->advisory_locking == NULL) {
1570			vnode->advisory_locking = locking;
1571			lockingDeleter.Detach();
1572			return B_OK;
1573		}
1574	}
1575
1576	// The vnode already had a locking object. That's just as well.
1577
1578	return B_OK;
1579}
1580
1581
1582/*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
1583	with the advisory_lock \a lock.
1584*/
1585static bool
1586advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1587{
1588	if (flock == NULL)
1589		return true;
1590
1591	return lock->start <= flock->l_start - 1 + flock->l_len
1592		&& lock->end >= flock->l_start;
1593}
1594
1595
1596/*!	Tests whether acquiring a lock would block.
1597*/
1598static status_t
1599test_advisory_lock(struct vnode* vnode, struct flock* flock)
1600{
1601	flock->l_type = F_UNLCK;
1602
1603	struct advisory_locking* locking = get_advisory_locking(vnode);
1604	if (locking == NULL)
1605		return B_OK;
1606
1607	team_id team = team_get_current_team_id();
1608
1609	LockList::Iterator iterator = locking->locks.GetIterator();
1610	while (iterator.HasNext()) {
1611		struct advisory_lock* lock = iterator.Next();
1612
1613		 if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1614			// locks do overlap
1615			if (flock->l_type != F_RDLCK || !lock->shared) {
1616				// collision
1617				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1618				flock->l_whence = SEEK_SET;
1619				flock->l_start = lock->start;
1620				flock->l_len = lock->end - lock->start + 1;
1621				flock->l_pid = lock->team;
1622				break;
1623			}
1624		}
1625	}
1626
1627	put_advisory_locking(locking);
1628	return B_OK;
1629}
1630
1631
1632/*!	Removes the specified lock, or all locks of the calling team
1633	if \a flock is NULL.
1634*/
1635static status_t
1636release_advisory_lock(struct vnode* vnode, struct io_context* context,
1637	struct file_descriptor* descriptor, struct flock* flock)
1638{
1639	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1640
1641	struct advisory_locking* locking = get_advisory_locking(vnode);
1642	if (locking == NULL)
1643		return B_OK;
1644
1645	// find matching lock entries
1646
1647	LockList::Iterator iterator = locking->locks.GetIterator();
1648	while (iterator.HasNext()) {
1649		struct advisory_lock* lock = iterator.Next();
1650		bool removeLock = false;
1651
1652		if (descriptor != NULL && lock->bound_to == descriptor) {
1653			// Remove flock() locks
1654			removeLock = true;
1655		} else if (lock->bound_to == context
1656				&& advisory_lock_intersects(lock, flock)) {
1657			// Remove POSIX locks
1658			bool endsBeyond = false;
1659			bool startsBefore = false;
1660			if (flock != NULL) {
1661				startsBefore = lock->start < flock->l_start;
1662				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1663			}
1664
1665			if (!startsBefore && !endsBeyond) {
1666				// lock is completely contained in flock
1667				removeLock = true;
1668			} else if (startsBefore && !endsBeyond) {
1669				// cut the end of the lock
1670				lock->end = flock->l_start - 1;
1671			} else if (!startsBefore && endsBeyond) {
1672				// cut the start of the lock
1673				lock->start = flock->l_start + flock->l_len;
1674			} else {
1675				// divide the lock into two locks
1676				struct advisory_lock* secondLock = new advisory_lock;
1677				if (secondLock == NULL) {
1678					// TODO: we should probably revert the locks we already
1679					// changed... (ie. allocate upfront)
1680					put_advisory_locking(locking);
1681					return B_NO_MEMORY;
1682				}
1683
1684				lock->end = flock->l_start - 1;
1685
1686				secondLock->bound_to = context;
1687				secondLock->team = lock->team;
1688				secondLock->session = lock->session;
1689				// values must already be normalized when getting here
1690				secondLock->start = flock->l_start + flock->l_len;
1691				secondLock->end = lock->end;
1692				secondLock->shared = lock->shared;
1693
1694				locking->locks.Add(secondLock);
1695			}
1696		}
1697
1698		if (removeLock) {
1699			// this lock is no longer used
1700			iterator.Remove();
1701			free(lock);
1702		}
1703	}
1704
1705	bool removeLocking = locking->locks.IsEmpty();
1706	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1707
1708	put_advisory_locking(locking);
1709
1710	if (removeLocking) {
1711		// We can remove the whole advisory locking structure; it's no
1712		// longer used
1713		locking = get_advisory_locking(vnode);
1714		if (locking != NULL) {
1715			ReadLocker locker(sVnodeLock);
1716			AutoLocker<Vnode> nodeLocker(vnode);
1717
1718			// the locking could have been changed in the mean time
1719			if (locking->locks.IsEmpty()) {
1720				vnode->advisory_locking = NULL;
1721				nodeLocker.Unlock();
1722				locker.Unlock();
1723
1724				// we've detached the locking from the vnode, so we can
1725				// safely delete it
1726				delete locking;
1727			} else {
1728				// the locking is in use again
1729				nodeLocker.Unlock();
1730				locker.Unlock();
1731				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1732			}
1733		}
1734	}
1735
1736	return B_OK;
1737}
1738
1739
1740/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
1741	will wait for the lock to become available, if there are any collisions
1742	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).
1743
1744	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
1745	BSD flock() semantics are used, that is, all children can unlock the file
1746	in question (we even allow parents to remove the lock, though, but that
1747	seems to be in line to what the BSD's are doing).
1748*/
1749static status_t
1750acquire_advisory_lock(struct vnode* vnode, io_context* context,
1751	struct file_descriptor* descriptor, struct flock* flock, bool wait)
1752{
1753	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1754		vnode, flock, wait ? "yes" : "no"));
1755	dprintf("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1756		vnode, flock, wait ? "yes" : "no");
1757
1758	bool shared = flock->l_type == F_RDLCK;
1759	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
1760	status_t status = B_OK;
1761
1762	// TODO: do deadlock detection!
1763
1764	struct advisory_locking* locking;
1765
1766	while (true) {
1767		// if this vnode has an advisory_locking structure attached,
1768		// lock that one and search for any colliding file lock
1769		status = create_advisory_locking(vnode);
1770		if (status != B_OK)
1771			return status;
1772
1773		locking = vnode->advisory_locking;
1774		team_id team = team_get_current_team_id();
1775		sem_id waitForLock = -1;
1776
1777		// test for collisions
1778		LockList::Iterator iterator = locking->locks.GetIterator();
1779		while (iterator.HasNext()) {
1780			struct advisory_lock* lock = iterator.Next();
1781
1782			// TODO: locks from the same team might be joinable!
1783			if ((lock->team != team || lock->bound_to != boundTo)
1784					&& advisory_lock_intersects(lock, flock)) {
1785				// locks do overlap
1786				if (!shared || !lock->shared) {
1787					// we need to wait
1788					waitForLock = locking->wait_sem;
1789					break;
1790				}
1791			}
1792		}
1793
1794		if (waitForLock < 0)
1795			break;
1796
1797		// We need to wait. Do that or fail now, if we've been asked not to.
1798
1799		if (!wait) {
1800			put_advisory_locking(locking);
1801			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1802		}
1803
1804		status = switch_sem_etc(locking->lock, waitForLock, 1,
1805			B_CAN_INTERRUPT, 0);
1806		if (status != B_OK && status != B_BAD_SEM_ID)
1807			return status;
1808
1809		// We have been notified, but we need to re-lock the locking object. So
1810		// go another round...
1811	}
1812
1813	// install new lock
1814
1815	struct advisory_lock* lock = (struct advisory_lock*)malloc(
1816		sizeof(struct advisory_lock));
1817	if (lock == NULL) {
1818		put_advisory_locking(locking);
1819		return B_NO_MEMORY;
1820	}
1821
1822	lock->bound_to = boundTo;
1823	lock->team = team_get_current_team_id();
1824	lock->session = thread_get_current_thread()->team->session_id;
1825	// values must already be normalized when getting here
1826	lock->start = flock->l_start;
1827	lock->end = flock->l_start - 1 + flock->l_len;
1828	lock->shared = shared;
1829
1830	locking->locks.Add(lock);
1831	put_advisory_locking(locking);
1832
1833	return status;
1834}
1835
1836
1837/*!	Normalizes the \a flock structure to make it easier to compare the
1838	structure with others. The l_start and l_len fields are set to absolute
1839	values according to the l_whence field.
1840*/
1841static status_t
1842normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1843{
1844	switch (flock->l_whence) {
1845		case SEEK_SET:
1846			break;
1847		case SEEK_CUR:
1848			flock->l_start += descriptor->pos;
1849			break;
1850		case SEEK_END:
1851		{
1852			struct vnode* vnode = descriptor->u.vnode;
1853			struct stat stat;
1854			status_t status;
1855
1856			if (!HAS_FS_CALL(vnode, read_stat))
1857				return B_UNSUPPORTED;
1858
1859			status = FS_CALL(vnode, read_stat, &stat);
1860			if (status != B_OK)
1861				return status;
1862
1863			flock->l_start += stat.st_size;
1864			break;
1865		}
1866		default:
1867			return B_BAD_VALUE;
1868	}
1869
1870	if (flock->l_start < 0)
1871		flock->l_start = 0;
1872	if (flock->l_len == 0)
1873		flock->l_len = OFF_MAX;
1874
1875	// don't let the offset and length overflow
1876	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1877		flock->l_len = OFF_MAX - flock->l_start;
1878
1879	if (flock->l_len < 0) {
1880		// a negative length reverses the region
1881		flock->l_start += flock->l_len;
1882		flock->l_len = -flock->l_len;
1883	}
1884
1885	return B_OK;
1886}
1887
1888
1889static void
1890replace_vnode_if_disconnected(struct fs_mount* mount,
1891	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1892	struct vnode* fallBack, bool lockRootLock)
1893{
1894	struct vnode* givenVnode = vnode;
1895	bool vnodeReplaced = false;
1896
1897	ReadLocker vnodeReadLocker(sVnodeLock);
1898
1899	if (lockRootLock)
1900		mutex_lock(&sIOContextRootLock);
1901
1902	while (vnode != NULL && vnode->mount == mount
1903		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1904		if (vnode->covers != NULL) {
1905			// redirect the vnode to the covered vnode
1906			vnode = vnode->covers;
1907		} else
1908			vnode = fallBack;
1909
1910		vnodeReplaced = true;
1911	}
1912
1913	// If we've replaced the node, grab a reference for the new one.
1914	if (vnodeReplaced && vnode != NULL)
1915		inc_vnode_ref_count(vnode);
1916
1917	if (lockRootLock)
1918		mutex_unlock(&sIOContextRootLock);
1919
1920	vnodeReadLocker.Unlock();
1921
1922	if (vnodeReplaced)
1923		put_vnode(givenVnode);
1924}
1925
1926
1927/*!	Disconnects all file descriptors that are associated with the
1928	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1929	\a mount object.
1930
1931	Note, after you've called this function, there might still be ongoing
1932	accesses - they won't be interrupted if they already happened before.
1933	However, any subsequent access will fail.
1934
1935	This is not a cheap function and should be used with care and rarely.
1936	TODO: there is currently no means to stop a blocking read/write!
1937*/
1938static void
1939disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1940	struct vnode* vnodeToDisconnect)
1941{
1942	// iterate over all teams and peek into their file descriptors
1943	TeamListIterator teamIterator;
1944	while (Team* team = teamIterator.Next()) {
1945		BReference<Team> teamReference(team, true);
1946		TeamLocker teamLocker(team);
1947
1948		// lock the I/O context
1949		io_context* context = team->io_context;
1950		if (context == NULL)
1951			continue;
1952		MutexLocker contextLocker(context->io_mutex);
1953
1954		teamLocker.Unlock();
1955
1956		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1957			sRoot, true);
1958		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1959			sRoot, false);
1960
1961		for (uint32 i = 0; i < context->table_size; i++) {
1962			struct file_descriptor* descriptor = context->fds[i];
1963			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1964				continue;
1965
1966			inc_fd_ref_count(descriptor);
1967
1968			// if this descriptor points at this mount, we
1969			// need to disconnect it to be able to unmount
1970			struct vnode* vnode = fd_vnode(descriptor);
1971			if (vnodeToDisconnect != NULL) {
1972				if (vnode == vnodeToDisconnect)
1973					disconnect_fd(descriptor);
1974			} else if ((vnode != NULL && vnode->mount == mount)
1975				|| (vnode == NULL && descriptor->u.mount == mount))
1976				disconnect_fd(descriptor);
1977
1978			put_fd(descriptor);
1979		}
1980	}
1981}
1982
1983
1984/*!	\brief Gets the root node of the current IO context.
1985	If \a kernel is \c true, the kernel IO context will be used.
1986	The caller obtains a reference to the returned node.
1987*/
1988struct vnode*
1989get_root_vnode(bool kernel)
1990{
1991	if (!kernel) {
1992		// Get current working directory from io context
1993		struct io_context* context = get_current_io_context(kernel);
1994
1995		mutex_lock(&sIOContextRootLock);
1996
1997		struct vnode* root = context->root;
1998		if (root != NULL)
1999			inc_vnode_ref_count(root);
2000
2001		mutex_unlock(&sIOContextRootLock);
2002
2003		if (root != NULL)
2004			return root;
2005
2006		// That should never happen.
2007		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2008			"have a root\n", team_get_current_team_id());
2009	}
2010
2011	inc_vnode_ref_count(sRoot);
2012	return sRoot;
2013}
2014
2015
2016/*!	\brief Gets the directory path and leaf name for a given path.
2017
2018	The supplied \a path is transformed to refer to the directory part of
2019	the entry identified by the original path, and into the buffer \a filename
2020	the leaf name of the original entry is written.
2021	Neither the returned path nor the leaf name can be expected to be
2022	canonical.
2023
2024	\param path The path to be analyzed. Must be able to store at least one
2025		   additional character.
2026	\param filename The buffer into which the leaf name will be written.
2027		   Must be of size B_FILE_NAME_LENGTH at least.
2028	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2029		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2030		   if the given path name is empty.
2031*/
2032static status_t
2033get_dir_path_and_leaf(char* path, char* filename)
2034{
2035	if (*path == '\0')
2036		return B_ENTRY_NOT_FOUND;
2037
2038	char* last = strrchr(path, '/');
2039		// '/' are not allowed in file names!
2040
2041	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2042
2043	if (last == NULL) {
2044		// this path is single segment with no '/' in it
2045		// ex. "foo"
2046		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2047			return B_NAME_TOO_LONG;
2048
2049		strcpy(path, ".");
2050	} else {
2051		last++;
2052		if (last[0] == '\0') {
2053			// special case: the path ends in one or more '/' - remove them
2054			while (*--last == '/' && last != path);
2055			last[1] = '\0';
2056
2057			if (last == path && last[0] == '/') {
2058				// This path points to the root of the file system
2059				strcpy(filename, ".");
2060				return B_OK;
2061			}
2062			for (; last != path && *(last - 1) != '/'; last--);
2063				// rewind to the start of the leaf before the '/'
2064		}
2065
2066		// normal leaf: replace the leaf portion of the path with a '.'
2067		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2068			return B_NAME_TOO_LONG;
2069
2070		last[0] = '.';
2071		last[1] = '\0';
2072	}
2073	return B_OK;
2074}
2075
2076
2077static status_t
2078entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2079	bool traverse, bool kernel, struct vnode** _vnode)
2080{
2081	char clonedName[B_FILE_NAME_LENGTH + 1];
2082	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2083		return B_NAME_TOO_LONG;
2084
2085	// get the directory vnode and let vnode_path_to_vnode() do the rest
2086	struct vnode* directory;
2087
2088	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2089	if (status < 0)
2090		return status;
2091
2092	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2093		_vnode, NULL);
2094}
2095
2096
2097/*!	Looks up the entry with name \a name in the directory represented by \a dir
2098	and returns the respective vnode.
2099	On success a reference to the vnode is acquired for the caller.
2100*/
2101static status_t
2102lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2103{
2104	ino_t id;
2105	bool missing;
2106
2107	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2108		return missing ? B_ENTRY_NOT_FOUND
2109			: get_vnode(dir->device, id, _vnode, true, false);
2110	}
2111
2112	status_t status = FS_CALL(dir, lookup, name, &id);
2113	if (status != B_OK)
2114		return status;
2115
2116	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2117	// have a reference and just need to look the node up.
2118	rw_lock_read_lock(&sVnodeLock);
2119	*_vnode = lookup_vnode(dir->device, id);
2120	rw_lock_read_unlock(&sVnodeLock);
2121
2122	if (*_vnode == NULL) {
2123		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2124			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2125		return B_ENTRY_NOT_FOUND;
2126	}
2127
2128//	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2129//		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2130//		(*_vnode)->mount->id, (*_vnode)->id);
2131
2132	return B_OK;
2133}
2134
2135
2136/*!	Returns the vnode for the relative path starting at the specified \a vnode.
2137	\a path must not be NULL.
2138	If it returns successfully, \a path contains the name of the last path
2139	component. This function clobbers the buffer pointed to by \a path only
2140	if it does contain more than one component.
2141	Note, this reduces the ref_count of the starting \a vnode, no matter if
2142	it is successful or not!
2143*/
2144static status_t
2145vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2146	int count, struct io_context* ioContext, struct vnode** _vnode,
2147	ino_t* _parentID)
2148{
2149	status_t status = B_OK;
2150	ino_t lastParentID = vnode->id;
2151
2152	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2153
2154	if (path == NULL) {
2155		put_vnode(vnode);
2156		return B_BAD_VALUE;
2157	}
2158
2159	if (*path == '\0') {
2160		put_vnode(vnode);
2161		return B_ENTRY_NOT_FOUND;
2162	}
2163
2164	while (true) {
2165		struct vnode* nextVnode;
2166		char* nextPath;
2167
2168		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2169			path));
2170
2171		// done?
2172		if (path[0] == '\0')
2173			break;
2174
2175		// walk to find the next path component ("path" will point to a single
2176		// path component), and filter out multiple slashes
2177		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2178				nextPath++);
2179
2180		if (*nextPath == '/') {
2181			*nextPath = '\0';
2182			do
2183				nextPath++;
2184			while (*nextPath == '/');
2185		}
2186
2187		// See if the '..' is at a covering vnode move to the covered
2188		// vnode so we pass the '..' path to the underlying filesystem.
2189		// Also prevent breaking the root of the IO context.
2190		if (strcmp("..", path) == 0) {
2191			if (vnode == ioContext->root) {
2192				// Attempted prison break! Keep it contained.
2193				path = nextPath;
2194				continue;
2195			}
2196
2197			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2198				nextVnode = coveredVnode;
2199				put_vnode(vnode);
2200				vnode = nextVnode;
2201			}
2202		}
2203
2204		// check if vnode is really a directory
2205		if (status == B_OK && !S_ISDIR(vnode->Type()))
2206			status = B_NOT_A_DIRECTORY;
2207
2208		// Check if we have the right to search the current directory vnode.
2209		// If a file system doesn't have the access() function, we assume that
2210		// searching a directory is always allowed
2211		if (status == B_OK && HAS_FS_CALL(vnode, access))
2212			status = FS_CALL(vnode, access, X_OK);
2213
2214		// Tell the filesystem to get the vnode of this path component (if we
2215		// got the permission from the call above)
2216		if (status == B_OK)
2217			status = lookup_dir_entry(vnode, path, &nextVnode);
2218
2219		if (status != B_OK) {
2220			put_vnode(vnode);
2221			return status;
2222		}
2223
2224		// If the new node is a symbolic link, resolve it (if we've been told
2225		// to do it)
2226		if (S_ISLNK(nextVnode->Type())
2227			&& (traverseLeafLink || nextPath[0] != '\0')) {
2228			size_t bufferSize;
2229			char* buffer;
2230
2231			TRACE(("traverse link\n"));
2232
2233			// it's not exactly nice style using goto in this way, but hey,
2234			// it works :-/
2235			if (count + 1 > B_MAX_SYMLINKS) {
2236				status = B_LINK_LIMIT;
2237				goto resolve_link_error;
2238			}
2239
2240			bufferSize = B_PATH_NAME_LENGTH;
2241			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2242			if (buffer == NULL) {
2243				status = B_NO_MEMORY;
2244				goto resolve_link_error;
2245			}
2246
2247			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2248				bufferSize--;
2249				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2250				// null-terminate
2251				if (status >= 0)
2252					buffer[bufferSize] = '\0';
2253			} else
2254				status = B_BAD_VALUE;
2255
2256			if (status != B_OK) {
2257				free(buffer);
2258
2259		resolve_link_error:
2260				put_vnode(vnode);
2261				put_vnode(nextVnode);
2262
2263				return status;
2264			}
2265			put_vnode(nextVnode);
2266
2267			// Check if we start from the root directory or the current
2268			// directory ("vnode" still points to that one).
2269			// Cut off all leading slashes if it's the root directory
2270			path = buffer;
2271			bool absoluteSymlink = false;
2272			if (path[0] == '/') {
2273				// we don't need the old directory anymore
2274				put_vnode(vnode);
2275
2276				while (*++path == '/')
2277					;
2278
2279				mutex_lock(&sIOContextRootLock);
2280				vnode = ioContext->root;
2281				inc_vnode_ref_count(vnode);
2282				mutex_unlock(&sIOContextRootLock);
2283
2284				absoluteSymlink = true;
2285			}
2286
2287			inc_vnode_ref_count(vnode);
2288				// balance the next recursion - we will decrement the
2289				// ref_count of the vnode, no matter if we succeeded or not
2290
2291			if (absoluteSymlink && *path == '\0') {
2292				// symlink was just "/"
2293				nextVnode = vnode;
2294			} else {
2295				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2296					ioContext, &nextVnode, &lastParentID);
2297			}
2298
2299			object_cache_free(sPathNameCache, buffer, 0);
2300
2301			if (status != B_OK) {
2302				put_vnode(vnode);
2303				return status;
2304			}
2305		} else
2306			lastParentID = vnode->id;
2307
2308		// decrease the ref count on the old dir we just looked up into
2309		put_vnode(vnode);
2310
2311		path = nextPath;
2312		vnode = nextVnode;
2313
2314		// see if we hit a covered node
2315		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2316			put_vnode(vnode);
2317			vnode = coveringNode;
2318		}
2319	}
2320
2321	*_vnode = vnode;
2322	if (_parentID)
2323		*_parentID = lastParentID;
2324
2325	return B_OK;
2326}
2327
2328
2329static status_t
2330vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2331	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2332{
2333	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2334		get_current_io_context(kernel), _vnode, _parentID);
2335}
2336
2337
2338static status_t
2339path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2340	ino_t* _parentID, bool kernel)
2341{
2342	struct vnode* start = NULL;
2343
2344	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2345
2346	if (!path)
2347		return B_BAD_VALUE;
2348
2349	if (*path == '\0')
2350		return B_ENTRY_NOT_FOUND;
2351
2352	// figure out if we need to start at root or at cwd
2353	if (*path == '/') {
2354		if (sRoot == NULL) {
2355			// we're a bit early, aren't we?
2356			return B_ERROR;
2357		}
2358
2359		while (*++path == '/')
2360			;
2361		start = get_root_vnode(kernel);
2362
2363		if (*path == '\0') {
2364			*_vnode = start;
2365			return B_OK;
2366		}
2367
2368	} else {
2369		struct io_context* context = get_current_io_context(kernel);
2370
2371		mutex_lock(&context->io_mutex);
2372		start = context->cwd;
2373		if (start != NULL)
2374			inc_vnode_ref_count(start);
2375		mutex_unlock(&context->io_mutex);
2376
2377		if (start == NULL)
2378			return B_ERROR;
2379	}
2380
2381	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2382		_parentID);
2383}
2384
2385
2386/*! Returns the vnode in the next to last segment of the path, and returns
2387	the last portion in filename.
2388	The path buffer must be able to store at least one additional character.
2389*/
2390static status_t
2391path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2392	bool kernel)
2393{
2394	status_t status = get_dir_path_and_leaf(path, filename);
2395	if (status != B_OK)
2396		return status;
2397
2398	return path_to_vnode(path, true, _vnode, NULL, kernel);
2399}
2400
2401
2402/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2403		   to by a FD + path pair.
2404
2405	\a path must be given in either case. \a fd might be omitted, in which
2406	case \a path is either an absolute path or one relative to the current
2407	directory. If both a supplied and \a path is relative it is reckoned off
2408	of the directory referred to by \a fd. If \a path is absolute \a fd is
2409	ignored.
2410
2411	The caller has the responsibility to call put_vnode() on the returned
2412	directory vnode.
2413
2414	\param fd The FD. May be < 0.
2415	\param path The absolute or relative path. Must not be \c NULL. The buffer
2416	       is modified by this function. It must have at least room for a
2417	       string one character longer than the path it contains.
2418	\param _vnode A pointer to a variable the directory vnode shall be written
2419		   into.
2420	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2421		   the leaf name of the specified entry will be written.
2422	\param kernel \c true, if invoked from inside the kernel, \c false if
2423		   invoked from userland.
2424	\return \c B_OK, if everything went fine, another error code otherwise.
2425*/
2426static status_t
2427fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2428	char* filename, bool kernel)
2429{
2430	if (!path)
2431		return B_BAD_VALUE;
2432	if (*path == '\0')
2433		return B_ENTRY_NOT_FOUND;
2434	if (fd < 0)
2435		return path_to_dir_vnode(path, _vnode, filename, kernel);
2436
2437	status_t status = get_dir_path_and_leaf(path, filename);
2438	if (status != B_OK)
2439		return status;
2440
2441	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2442}
2443
2444
2445/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2446		   to by a vnode + path pair.
2447
2448	\a path must be given in either case. \a vnode might be omitted, in which
2449	case \a path is either an absolute path or one relative to the current
2450	directory. If both a supplied and \a path is relative it is reckoned off
2451	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2452	ignored.
2453
2454	The caller has the responsibility to call put_vnode() on the returned
2455	directory vnode.
2456
2457	\param vnode The vnode. May be \c NULL.
2458	\param path The absolute or relative path. Must not be \c NULL. The buffer
2459	       is modified by this function. It must have at least room for a
2460	       string one character longer than the path it contains.
2461	\param _vnode A pointer to a variable the directory vnode shall be written
2462		   into.
2463	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2464		   the leaf name of the specified entry will be written.
2465	\param kernel \c true, if invoked from inside the kernel, \c false if
2466		   invoked from userland.
2467	\return \c B_OK, if everything went fine, another error code otherwise.
2468*/
2469static status_t
2470vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2471	struct vnode** _vnode, char* filename, bool kernel)
2472{
2473	if (!path)
2474		return B_BAD_VALUE;
2475	if (*path == '\0')
2476		return B_ENTRY_NOT_FOUND;
2477	if (vnode == NULL || path[0] == '/')
2478		return path_to_dir_vnode(path, _vnode, filename, kernel);
2479
2480	status_t status = get_dir_path_and_leaf(path, filename);
2481	if (status != B_OK)
2482		return status;
2483
2484	inc_vnode_ref_count(vnode);
2485		// vnode_path_to_vnode() always decrements the ref count
2486
2487	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2488}
2489
2490
2491/*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2492*/
2493static status_t
2494get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2495	size_t bufferSize, struct io_context* ioContext)
2496{
2497	if (bufferSize < sizeof(struct dirent))
2498		return B_BAD_VALUE;
2499
2500	// See if the vnode is covering another vnode and move to the covered
2501	// vnode so we get the underlying file system
2502	VNodePutter vnodePutter;
2503	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2504		vnode = coveredVnode;
2505		vnodePutter.SetTo(vnode);
2506	}
2507
2508	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2509		// The FS supports getting the name of a vnode.
2510		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2511			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2512			return B_OK;
2513	}
2514
2515	// The FS doesn't support getting the name of a vnode. So we search the
2516	// parent directory for the vnode, if the caller let us.
2517
2518	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2519		return B_UNSUPPORTED;
2520
2521	void* cookie;
2522
2523	status_t status = FS_CALL(parent, open_dir, &cookie);
2524	if (status >= B_OK) {
2525		while (true) {
2526			uint32 num = 1;
2527			// We use the FS hook directly instead of dir_read(), since we don't
2528			// want the entries to be fixed. We have already resolved vnode to
2529			// the covered node.
2530			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2531				&num);
2532			if (status != B_OK)
2533				break;
2534			if (num == 0) {
2535				status = B_ENTRY_NOT_FOUND;
2536				break;
2537			}
2538
2539			if (vnode->id == buffer->d_ino) {
2540				// found correct entry!
2541				break;
2542			}
2543		}
2544
2545		FS_CALL(parent, close_dir, cookie);
2546		FS_CALL(parent, free_dir_cookie, cookie);
2547	}
2548	return status;
2549}
2550
2551
2552static status_t
2553get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2554	size_t nameSize, bool kernel)
2555{
2556	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2557	struct dirent* dirent = (struct dirent*)buffer;
2558
2559	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2560		get_current_io_context(kernel));
2561	if (status != B_OK)
2562		return status;
2563
2564	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2565		return B_BUFFER_OVERFLOW;
2566
2567	return B_OK;
2568}
2569
2570
2571/*!	Gets the full path to a given directory vnode.
2572	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2573	file system doesn't support this call, it will fall back to iterating
2574	through the parent directory to get the name of the child.
2575
2576	To protect against circular loops, it supports a maximum tree depth
2577	of 256 levels.
2578
2579	Note that the path may not be correct the time this function returns!
2580	It doesn't use any locking to prevent returning the correct path, as
2581	paths aren't safe anyway: the path to a file can change at any time.
2582
2583	It might be a good idea, though, to check if the returned path exists
2584	in the calling function (it's not done here because of efficiency)
2585*/
2586static status_t
2587dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2588	bool kernel)
2589{
2590	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2591
2592	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2593		return B_BAD_VALUE;
2594
2595	if (!S_ISDIR(vnode->Type()))
2596		return B_NOT_A_DIRECTORY;
2597
2598	char* path = buffer;
2599	int32 insert = bufferSize;
2600	int32 maxLevel = 256;
2601	int32 length;
2602	status_t status = B_OK;
2603	struct io_context* ioContext = get_current_io_context(kernel);
2604
2605	// we don't use get_vnode() here because this call is more
2606	// efficient and does all we need from get_vnode()
2607	inc_vnode_ref_count(vnode);
2608
2609	path[--insert] = '\0';
2610		// the path is filled right to left
2611
2612	while (true) {
2613		// If the node is the context's root, bail out. Otherwise resolve mount
2614		// points.
2615		if (vnode == ioContext->root)
2616			break;
2617
2618		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2619			put_vnode(vnode);
2620			vnode = coveredVnode;
2621		}
2622
2623		// lookup the parent vnode
2624		struct vnode* parentVnode;
2625		status = lookup_dir_entry(vnode, "..", &parentVnode);
2626		if (status != B_OK)
2627			goto out;
2628
2629		if (parentVnode == vnode) {
2630			// The caller apparently got their hands on a node outside of their
2631			// context's root. Now we've hit the global root.
2632			put_vnode(parentVnode);
2633			break;
2634		}
2635
2636		// get the node's name
2637		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2638			// also used for fs_read_dir()
2639		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2640		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2641			sizeof(nameBuffer), ioContext);
2642
2643		// release the current vnode, we only need its parent from now on
2644		put_vnode(vnode);
2645		vnode = parentVnode;
2646
2647		if (status != B_OK)
2648			goto out;
2649
2650		// TODO: add an explicit check for loops in about 10 levels to do
2651		// real loop detection
2652
2653		// don't go deeper as 'maxLevel' to prevent circular loops
2654		if (maxLevel-- < 0) {
2655			status = B_LINK_LIMIT;
2656			goto out;
2657		}
2658
2659		// add the name in front of the current path
2660		name[B_FILE_NAME_LENGTH - 1] = '\0';
2661		length = strlen(name);
2662		insert -= length;
2663		if (insert <= 0) {
2664			status = B_RESULT_NOT_REPRESENTABLE;
2665			goto out;
2666		}
2667		memcpy(path + insert, name, length);
2668		path[--insert] = '/';
2669	}
2670
2671	// the root dir will result in an empty path: fix it
2672	if (path[insert] == '\0')
2673		path[--insert] = '/';
2674
2675	TRACE(("  path is: %s\n", path + insert));
2676
2677	// move the path to the start of the buffer
2678	length = bufferSize - insert;
2679	memmove(buffer, path + insert, length);
2680
2681out:
2682	put_vnode(vnode);
2683	return status;
2684}
2685
2686
2687/*!	Checks the length of every path component, and adds a '.'
2688	if the path ends in a slash.
2689	The given path buffer must be able to store at least one
2690	additional character.
2691*/
2692static status_t
2693check_path(char* to)
2694{
2695	int32 length = 0;
2696
2697	// check length of every path component
2698
2699	while (*to) {
2700		char* begin;
2701		if (*to == '/')
2702			to++, length++;
2703
2704		begin = to;
2705		while (*to != '/' && *to)
2706			to++, length++;
2707
2708		if (to - begin > B_FILE_NAME_LENGTH)
2709			return B_NAME_TOO_LONG;
2710	}
2711
2712	if (length == 0)
2713		return B_ENTRY_NOT_FOUND;
2714
2715	// complete path if there is a slash at the end
2716
2717	if (*(to - 1) == '/') {
2718		if (length > B_PATH_NAME_LENGTH - 2)
2719			return B_NAME_TOO_LONG;
2720
2721		to[0] = '.';
2722		to[1] = '\0';
2723	}
2724
2725	return B_OK;
2726}
2727
2728
2729static struct file_descriptor*
2730get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2731{
2732	struct file_descriptor* descriptor
2733		= get_fd(get_current_io_context(kernel), fd);
2734	if (descriptor == NULL)
2735		return NULL;
2736
2737	struct vnode* vnode = fd_vnode(descriptor);
2738	if (vnode == NULL) {
2739		put_fd(descriptor);
2740		return NULL;
2741	}
2742
2743	// ToDo: when we can close a file descriptor at any point, investigate
2744	//	if this is still valid to do (accessing the vnode without ref_count
2745	//	or locking)
2746	*_vnode = vnode;
2747	return descriptor;
2748}
2749
2750
2751static struct vnode*
2752get_vnode_from_fd(int fd, bool kernel)
2753{
2754	struct file_descriptor* descriptor;
2755	struct vnode* vnode;
2756
2757	descriptor = get_fd(get_current_io_context(kernel), fd);
2758	if (descriptor == NULL)
2759		return NULL;
2760
2761	vnode = fd_vnode(descriptor);
2762	if (vnode != NULL)
2763		inc_vnode_ref_count(vnode);
2764
2765	put_fd(descriptor);
2766	return vnode;
2767}
2768
2769
2770/*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2771	only the path will be considered. In this case, the \a path must not be
2772	NULL.
2773	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2774	and should be NULL for files.
2775*/
2776static status_t
2777fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2778	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2779{
2780	if (fd < 0 && !path)
2781		return B_BAD_VALUE;
2782
2783	if (path != NULL && *path == '\0')
2784		return B_ENTRY_NOT_FOUND;
2785
2786	if (fd < 0 || (path != NULL && path[0] == '/')) {
2787		// no FD or absolute path
2788		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2789	}
2790
2791	// FD only, or FD + relative path
2792	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2793	if (vnode == NULL)
2794		return B_FILE_ERROR;
2795
2796	if (path != NULL) {
2797		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2798			_vnode, _parentID);
2799	}
2800
2801	// there is no relative path to take into account
2802
2803	*_vnode = vnode;
2804	if (_parentID)
2805		*_parentID = -1;
2806
2807	return B_OK;
2808}
2809
2810
2811static int
2812get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2813	void* cookie, int openMode, bool kernel)
2814{
2815	struct file_descriptor* descriptor;
2816	int fd;
2817
2818	// If the vnode is locked, we don't allow creating a new file/directory
2819	// file_descriptor for it
2820	if (vnode && vnode->mandatory_locked_by != NULL
2821		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2822		return B_BUSY;
2823
2824	descriptor = alloc_fd();
2825	if (!descriptor)
2826		return B_NO_MEMORY;
2827
2828	if (vnode)
2829		descriptor->u.vnode = vnode;
2830	else
2831		descriptor->u.mount = mount;
2832	descriptor->cookie = cookie;
2833
2834	switch (type) {
2835		// vnode types
2836		case FDTYPE_FILE:
2837			descriptor->ops = &sFileOps;
2838			break;
2839		case FDTYPE_DIR:
2840			descriptor->ops = &sDirectoryOps;
2841			break;
2842		case FDTYPE_ATTR:
2843			descriptor->ops = &sAttributeOps;
2844			break;
2845		case FDTYPE_ATTR_DIR:
2846			descriptor->ops = &sAttributeDirectoryOps;
2847			break;
2848
2849		// mount types
2850		case FDTYPE_INDEX_DIR:
2851			descriptor->ops = &sIndexDirectoryOps;
2852			break;
2853		case FDTYPE_QUERY:
2854			descriptor->ops = &sQueryOps;
2855			break;
2856
2857		default:
2858			panic("get_new_fd() called with unknown type %d\n", type);
2859			break;
2860	}
2861	descriptor->type = type;
2862	descriptor->open_mode = openMode;
2863
2864	io_context* context = get_current_io_context(kernel);
2865	fd = new_fd(context, descriptor);
2866	if (fd < 0) {
2867		descriptor->ops = NULL;
2868		put_fd(descriptor);
2869		return B_NO_MORE_FDS;
2870	}
2871
2872	mutex_lock(&context->io_mutex);
2873	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2874	mutex_unlock(&context->io_mutex);
2875
2876	return fd;
2877}
2878
2879
2880/*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2881	vfs_normalize_path(). See there for more documentation.
2882*/
2883static status_t
2884normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2885{
2886	VNodePutter dirPutter;
2887	struct vnode* dir = NULL;
2888	status_t error;
2889
2890	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2891		// get dir vnode + leaf name
2892		struct vnode* nextDir;
2893		char leaf[B_FILE_NAME_LENGTH];
2894		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2895		if (error != B_OK)
2896			return error;
2897
2898		dir = nextDir;
2899		strcpy(path, leaf);
2900		dirPutter.SetTo(dir);
2901
2902		// get file vnode, if we shall resolve links
2903		bool fileExists = false;
2904		struct vnode* fileVnode;
2905		VNodePutter fileVnodePutter;
2906		if (traverseLink) {
2907			inc_vnode_ref_count(dir);
2908			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2909					NULL) == B_OK) {
2910				fileVnodePutter.SetTo(fileVnode);
2911				fileExists = true;
2912			}
2913		}
2914
2915		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2916			// we're done -- construct the path
2917			bool hasLeaf = true;
2918			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2919				// special cases "." and ".." -- get the dir, forget the leaf
2920				inc_vnode_ref_count(dir);
2921				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2922					&nextDir, NULL);
2923				if (error != B_OK)
2924					return error;
2925				dir = nextDir;
2926				dirPutter.SetTo(dir);
2927				hasLeaf = false;
2928			}
2929
2930			// get the directory path
2931			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2932			if (error != B_OK)
2933				return error;
2934
2935			// append the leaf name
2936			if (hasLeaf) {
2937				// insert a directory separator if this is not the file system
2938				// root
2939				if ((strcmp(path, "/") != 0
2940					&& strlcat(path, "/", pathSize) >= pathSize)
2941					|| strlcat(path, leaf, pathSize) >= pathSize) {
2942					return B_NAME_TOO_LONG;
2943				}
2944			}
2945
2946			return B_OK;
2947		}
2948
2949		// read link
2950		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2951			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2952			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2953			if (error != B_OK)
2954				return error;
2955			path[bufferSize] = '\0';
2956		} else
2957			return B_BAD_VALUE;
2958	}
2959
2960	return B_LINK_LIMIT;
2961}
2962
2963
2964static status_t
2965resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2966	struct io_context* ioContext)
2967{
2968	// Make sure the IO context root is not bypassed.
2969	if (parent == ioContext->root) {
2970		*_device = parent->device;
2971		*_node = parent->id;
2972		return B_OK;
2973	}
2974
2975	inc_vnode_ref_count(parent);
2976		// vnode_path_to_vnode() puts the node
2977
2978	// ".." is guaranteed not to be clobbered by this call
2979	struct vnode* vnode;
2980	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2981		ioContext, &vnode, NULL);
2982	if (status == B_OK) {
2983		*_device = vnode->device;
2984		*_node = vnode->id;
2985		put_vnode(vnode);
2986	}
2987
2988	return status;
2989}
2990
2991
2992#ifdef ADD_DEBUGGER_COMMANDS
2993
2994
2995static void
2996_dump_advisory_locking(advisory_locking* locking)
2997{
2998	if (locking == NULL)
2999		return;
3000
3001	kprintf("   lock:        %" B_PRId32, locking->lock);
3002	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
3003
3004	int32 index = 0;
3005	LockList::Iterator iterator = locking->locks.GetIterator();
3006	while (iterator.HasNext()) {
3007		struct advisory_lock* lock = iterator.Next();
3008
3009		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3010		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3011		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3012		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3013	}
3014}
3015
3016
3017static void
3018_dump_mount(struct fs_mount* mount)
3019{
3020	kprintf("MOUNT: %p\n", mount);
3021	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3022	kprintf(" device_name:   %s\n", mount->device_name);
3023	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3024	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3025	kprintf(" partition:     %p\n", mount->partition);
3026	kprintf(" lock:          %p\n", &mount->lock);
3027	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3028		mount->owns_file_device ? " owns_file_device" : "");
3029
3030	fs_volume* volume = mount->volume;
3031	while (volume != NULL) {
3032		kprintf(" volume %p:\n", volume);
3033		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3034		kprintf("  private_volume:   %p\n", volume->private_volume);
3035		kprintf("  ops:              %p\n", volume->ops);
3036		kprintf("  file_system:      %p\n", volume->file_system);
3037		kprintf("  file_system_name: %s\n", volume->file_system_name);
3038		volume = volume->super_volume;
3039	}
3040
3041	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3042	set_debug_variable("_root", (addr_t)mount->root_vnode);
3043	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3044	set_debug_variable("_partition", (addr_t)mount->partition);
3045}
3046
3047
3048static bool
3049debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3050	const char* name)
3051{
3052	bool insertSlash = buffer[bufferSize] != '\0';
3053	size_t nameLength = strlen(name);
3054
3055	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3056		return false;
3057
3058	if (insertSlash)
3059		buffer[--bufferSize] = '/';
3060
3061	bufferSize -= nameLength;
3062	memcpy(buffer + bufferSize, name, nameLength);
3063
3064	return true;
3065}
3066
3067
3068static bool
3069debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3070	ino_t nodeID)
3071{
3072	if (bufferSize == 0)
3073		return false;
3074
3075	bool insertSlash = buffer[bufferSize] != '\0';
3076	if (insertSlash)
3077		buffer[--bufferSize] = '/';
3078
3079	size_t size = snprintf(buffer, bufferSize,
3080		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3081	if (size > bufferSize) {
3082		if (insertSlash)
3083			bufferSize++;
3084		return false;
3085	}
3086
3087	if (size < bufferSize)
3088		memmove(buffer + bufferSize - size, buffer, size);
3089
3090	bufferSize -= size;
3091	return true;
3092}
3093
3094
3095static char*
3096debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3097	bool& _truncated)
3098{
3099	// null-terminate the path
3100	buffer[--bufferSize] = '\0';
3101
3102	while (true) {
3103		while (vnode->covers != NULL)
3104			vnode = vnode->covers;
3105
3106		if (vnode == sRoot) {
3107			_truncated = bufferSize == 0;
3108			if (!_truncated)
3109				buffer[--bufferSize] = '/';
3110			return buffer + bufferSize;
3111		}
3112
3113		// resolve the name
3114		ino_t dirID;
3115		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3116			vnode->id, dirID);
3117		if (name == NULL) {
3118			// Failed to resolve the name -- prepend "<dev,node>/".
3119			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3120				vnode->mount->id, vnode->id);
3121			return buffer + bufferSize;
3122		}
3123
3124		// prepend the name
3125		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3126			_truncated = true;
3127			return buffer + bufferSize;
3128		}
3129
3130		// resolve the directory node
3131		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3132		if (nextVnode == NULL) {
3133			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3134				vnode->mount->id, dirID);
3135			return buffer + bufferSize;
3136		}
3137
3138		vnode = nextVnode;
3139	}
3140}
3141
3142
3143static void
3144_dump_vnode(struct vnode* vnode, bool printPath)
3145{
3146	kprintf("VNODE: %p\n", vnode);
3147	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3148	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3149	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3150	kprintf(" private_node:  %p\n", vnode->private_node);
3151	kprintf(" mount:         %p\n", vnode->mount);
3152	kprintf(" covered_by:    %p\n", vnode->covered_by);
3153	kprintf(" covers:        %p\n", vnode->covers);
3154	kprintf(" cache:         %p\n", vnode->cache);
3155	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3156	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3157		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3158	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3159
3160	_dump_advisory_locking(vnode->advisory_locking);
3161
3162	if (printPath) {
3163		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3164		if (buffer != NULL) {
3165			bool truncated;
3166			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3167				B_PATH_NAME_LENGTH, truncated);
3168			if (path != NULL) {
3169				kprintf(" path:          ");
3170				if (truncated)
3171					kputs("<truncated>/");
3172				kputs(path);
3173				kputs("\n");
3174			} else
3175				kprintf("Failed to resolve vnode path.\n");
3176
3177			debug_free(buffer);
3178		} else
3179			kprintf("Failed to allocate memory for constructing the path.\n");
3180	}
3181
3182	set_debug_variable("_node", (addr_t)vnode->private_node);
3183	set_debug_variable("_mount", (addr_t)vnode->mount);
3184	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3185	set_debug_variable("_covers", (addr_t)vnode->covers);
3186	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3187}
3188
3189
3190static int
3191dump_mount(int argc, char** argv)
3192{
3193	if (argc != 2 || !strcmp(argv[1], "--help")) {
3194		kprintf("usage: %s [id|address]\n", argv[0]);
3195		return 0;
3196	}
3197
3198	ulong val = parse_expression(argv[1]);
3199	uint32 id = val;
3200
3201	struct fs_mount* mount = sMountsTable->Lookup(id);
3202	if (mount == NULL) {
3203		if (IS_USER_ADDRESS(id)) {
3204			kprintf("fs_mount not found\n");
3205			return 0;
3206		}
3207		mount = (fs_mount*)val;
3208	}
3209
3210	_dump_mount(mount);
3211	return 0;
3212}
3213
3214
3215static int
3216dump_mounts(int argc, char** argv)
3217{
3218	if (argc != 1) {
3219		kprintf("usage: %s\n", argv[0]);
3220		return 0;
3221	}
3222
3223	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3224		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3225		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3226
3227	struct fs_mount* mount;
3228
3229	MountTable::Iterator iterator(sMountsTable);
3230	while (iterator.HasNext()) {
3231		mount = iterator.Next();
3232		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3233			mount->root_vnode->covers, mount->volume->private_volume,
3234			mount->volume->file_system_name);
3235
3236		fs_volume* volume = mount->volume;
3237		while (volume->super_volume != NULL) {
3238			volume = volume->super_volume;
3239			kprintf("                                     %p %s\n",
3240				volume->private_volume, volume->file_system_name);
3241		}
3242	}
3243
3244	return 0;
3245}
3246
3247
3248static int
3249dump_vnode(int argc, char** argv)
3250{
3251	bool printPath = false;
3252	int argi = 1;
3253	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3254		printPath = true;
3255		argi++;
3256	}
3257
3258	if (argi >= argc || argi + 2 < argc) {
3259		print_debugger_command_usage(argv[0]);
3260		return 0;
3261	}
3262
3263	struct vnode* vnode = NULL;
3264
3265	if (argi + 1 == argc) {
3266		vnode = (struct vnode*)parse_expression(argv[argi]);
3267		if (IS_USER_ADDRESS(vnode)) {
3268			kprintf("invalid vnode address\n");
3269			return 0;
3270		}
3271		_dump_vnode(vnode, printPath);
3272		return 0;
3273	}
3274
3275	dev_t device = parse_expression(argv[argi]);
3276	ino_t id = parse_expression(argv[argi + 1]);
3277
3278	VnodeTable::Iterator iterator(sVnodeTable);
3279	while (iterator.HasNext()) {
3280		vnode = iterator.Next();
3281		if (vnode->id != id || vnode->device != device)
3282			continue;
3283
3284		_dump_vnode(vnode, printPath);
3285	}
3286
3287	return 0;
3288}
3289
3290
3291static int
3292dump_vnodes(int argc, char** argv)
3293{
3294	if (argc != 2 || !strcmp(argv[1], "--help")) {
3295		kprintf("usage: %s [device]\n", argv[0]);
3296		return 0;
3297	}
3298
3299	// restrict dumped nodes to a certain device if requested
3300	dev_t device = parse_expression(argv[1]);
3301
3302	struct vnode* vnode;
3303
3304	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3305		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3306		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3307
3308	VnodeTable::Iterator iterator(sVnodeTable);
3309	while (iterator.HasNext()) {
3310		vnode = iterator.Next();
3311		if (vnode->device != device)
3312			continue;
3313
3314		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3315			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3316			vnode->private_node, vnode->advisory_locking,
3317			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3318			vnode->IsUnpublished() ? "u" : "-");
3319	}
3320
3321	return 0;
3322}
3323
3324
3325static int
3326dump_vnode_caches(int argc, char** argv)
3327{
3328	struct vnode* vnode;
3329
3330	if (argc > 2 || !strcmp(argv[1], "--help")) {
3331		kprintf("usage: %s [device]\n", argv[0]);
3332		return 0;
3333	}
3334
3335	// restrict dumped nodes to a certain device if requested
3336	dev_t device = -1;
3337	if (argc > 1)
3338		device = parse_expression(argv[1]);
3339
3340	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3341		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3342
3343	VnodeTable::Iterator iterator(sVnodeTable);
3344	while (iterator.HasNext()) {
3345		vnode = iterator.Next();
3346		if (vnode->cache == NULL)
3347			continue;
3348		if (device != -1 && vnode->device != device)
3349			continue;
3350
3351		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3352			vnode, vnode->device, vnode->id, vnode->cache,
3353			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3354			vnode->cache->page_count);
3355	}
3356
3357	return 0;
3358}
3359
3360
3361int
3362dump_io_context(int argc, char** argv)
3363{
3364	if (argc > 2 || !strcmp(argv[1], "--help")) {
3365		kprintf("usage: %s [team-id|address]\n", argv[0]);
3366		return 0;
3367	}
3368
3369	struct io_context* context = NULL;
3370
3371	if (argc > 1) {
3372		ulong num = parse_expression(argv[1]);
3373		if (IS_KERNEL_ADDRESS(num))
3374			context = (struct io_context*)num;
3375		else {
3376			Team* team = team_get_team_struct_locked(num);
3377			if (team == NULL) {
3378				kprintf("could not find team with ID %lu\n", num);
3379				return 0;
3380			}
3381			context = (struct io_context*)team->io_context;
3382		}
3383	} else
3384		context = get_current_io_context(true);
3385
3386	kprintf("I/O CONTEXT: %p\n", context);
3387	kprintf(" root vnode:\t%p\n", context->root);
3388	kprintf(" cwd vnode:\t%p\n", context->cwd);
3389	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3390	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3391
3392	if (context->num_used_fds) {
3393		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3394			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3395	}
3396
3397	for (uint32 i = 0; i < context->table_size; i++) {
3398		struct file_descriptor* fd = context->fds[i];
3399		if (fd == NULL)
3400			continue;
3401
3402		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3403			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3404			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3405			fd->pos, fd->cookie,
3406			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3407				? "mount" : "vnode",
3408			fd->u.vnode);
3409	}
3410
3411	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3412	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3413
3414	set_debug_variable("_cwd", (addr_t)context->cwd);
3415
3416	return 0;
3417}
3418
3419
3420int
3421dump_vnode_usage(int argc, char** argv)
3422{
3423	if (argc != 1) {
3424		kprintf("usage: %s\n", argv[0]);
3425		return 0;
3426	}
3427
3428	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3429		sUnusedVnodes, kMaxUnusedVnodes);
3430
3431	uint32 count = sVnodeTable->CountElements();
3432
3433	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3434		count - sUnusedVnodes);
3435	return 0;
3436}
3437
3438#endif	// ADD_DEBUGGER_COMMANDS
3439
3440
3441/*!	Clears memory specified by an iovec array.
3442*/
3443static void
3444zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3445{
3446	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3447		size_t length = std::min(vecs[i].iov_len, bytes);
3448		memset(vecs[i].iov_base, 0, length);
3449		bytes -= length;
3450	}
3451}
3452
3453
3454/*!	Does the dirty work of combining the file_io_vecs with the iovecs
3455	and calls the file system hooks to read/write the request to disk.
3456*/
3457static status_t
3458common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3459	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3460	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3461	bool doWrite)
3462{
3463	if (fileVecCount == 0) {
3464		// There are no file vecs at this offset, so we're obviously trying
3465		// to access the file outside of its bounds
3466		return B_BAD_VALUE;
3467	}
3468
3469	size_t numBytes = *_numBytes;
3470	uint32 fileVecIndex;
3471	size_t vecOffset = *_vecOffset;
3472	uint32 vecIndex = *_vecIndex;
3473	status_t status;
3474	size_t size;
3475
3476	if (!doWrite && vecOffset == 0) {
3477		// now directly read the data from the device
3478		// the first file_io_vec can be read directly
3479
3480		if (fileVecs[0].length < (off_t)numBytes)
3481			size = fileVecs[0].length;
3482		else
3483			size = numBytes;
3484
3485		if (fileVecs[0].offset >= 0) {
3486			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3487				&vecs[vecIndex], vecCount - vecIndex, &size);
3488		} else {
3489			// sparse read
3490			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3491			status = B_OK;
3492		}
3493		if (status != B_OK)
3494			return status;
3495
3496		// TODO: this is a work-around for buggy device drivers!
3497		//	When our own drivers honour the length, we can:
3498		//	a) also use this direct I/O for writes (otherwise, it would
3499		//	   overwrite precious data)
3500		//	b) panic if the term below is true (at least for writes)
3501		if ((off_t)size > fileVecs[0].length) {
3502			//dprintf("warning: device driver %p doesn't respect total length "
3503			//	"in read_pages() call!\n", ref->device);
3504			size = fileVecs[0].length;
3505		}
3506
3507		ASSERT((off_t)size <= fileVecs[0].length);
3508
3509		// If the file portion was contiguous, we're already done now
3510		if (size == numBytes)
3511			return B_OK;
3512
3513		// if we reached the end of the file, we can return as well
3514		if ((off_t)size != fileVecs[0].length) {
3515			*_numBytes = size;
3516			return B_OK;
3517		}
3518
3519		fileVecIndex = 1;
3520
3521		// first, find out where we have to continue in our iovecs
3522		for (; vecIndex < vecCount; vecIndex++) {
3523			if (size < vecs[vecIndex].iov_len)
3524				break;
3525
3526			size -= vecs[vecIndex].iov_len;
3527		}
3528
3529		vecOffset = size;
3530	} else {
3531		fileVecIndex = 0;
3532		size = 0;
3533	}
3534
3535	// Too bad, let's process the rest of the file_io_vecs
3536
3537	size_t totalSize = size;
3538	size_t bytesLeft = numBytes - size;
3539
3540	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3541		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3542		off_t fileOffset = fileVec.offset;
3543		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3544
3545		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3546			fileLeft));
3547
3548		// process the complete fileVec
3549		while (fileLeft > 0) {
3550			iovec tempVecs[MAX_TEMP_IO_VECS];
3551			uint32 tempCount = 0;
3552
3553			// size tracks how much of what is left of the current fileVec
3554			// (fileLeft) has been assigned to tempVecs
3555			size = 0;
3556
3557			// assign what is left of the current fileVec to the tempVecs
3558			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3559					&& tempCount < MAX_TEMP_IO_VECS;) {
3560				// try to satisfy one iovec per iteration (or as much as
3561				// possible)
3562
3563				// bytes left of the current iovec
3564				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3565				if (vecLeft == 0) {
3566					vecOffset = 0;
3567					vecIndex++;
3568					continue;
3569				}
3570
3571				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3572					vecIndex, vecOffset, size));
3573
3574				// actually available bytes
3575				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3576
3577				tempVecs[tempCount].iov_base
3578					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3579				tempVecs[tempCount].iov_len = tempVecSize;
3580				tempCount++;
3581
3582				size += tempVecSize;
3583				vecOffset += tempVecSize;
3584			}
3585
3586			size_t bytes = size;
3587
3588			if (fileOffset == -1) {
3589				if (doWrite) {
3590					panic("sparse write attempt: vnode %p", vnode);
3591					status = B_IO_ERROR;
3592				} else {
3593					// sparse read
3594					zero_iovecs(tempVecs, tempCount, bytes);
3595					status = B_OK;
3596				}
3597			} else if (doWrite) {
3598				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3599					tempVecs, tempCount, &bytes);
3600			} else {
3601				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3602					tempVecs, tempCount, &bytes);
3603			}
3604			if (status != B_OK)
3605				return status;
3606
3607			totalSize += bytes;
3608			bytesLeft -= size;
3609			if (fileOffset >= 0)
3610				fileOffset += size;
3611			fileLeft -= size;
3612			//dprintf("-> file left = %Lu\n", fileLeft);
3613
3614			if (size != bytes || vecIndex >= vecCount) {
3615				// there are no more bytes or iovecs, let's bail out
3616				*_numBytes = totalSize;
3617				return B_OK;
3618			}
3619		}
3620	}
3621
3622	*_vecIndex = vecIndex;
3623	*_vecOffset = vecOffset;
3624	*_numBytes = totalSize;
3625	return B_OK;
3626}
3627
3628
3629static bool
3630is_user_in_group(gid_t gid)
3631{
3632	if (gid == getegid())
3633		return true;
3634
3635	gid_t groups[NGROUPS_MAX];
3636	int groupCount = getgroups(NGROUPS_MAX, groups);
3637	for (int i = 0; i < groupCount; i++) {
3638		if (gid == groups[i])
3639			return true;
3640	}
3641
3642	return false;
3643}
3644
3645
3646static status_t
3647free_io_context(io_context* context)
3648{
3649	uint32 i;
3650
3651	TIOC(FreeIOContext(context));
3652
3653	if (context->root)
3654		put_vnode(context->root);
3655
3656	if (context->cwd)
3657		put_vnode(context->cwd);
3658
3659	mutex_lock(&context->io_mutex);
3660
3661	for (i = 0; i < context->table_size; i++) {
3662		if (struct file_descriptor* descriptor = context->fds[i]) {
3663			close_fd(context, descriptor);
3664			put_fd(descriptor);
3665		}
3666	}
3667
3668	mutex_destroy(&context->io_mutex);
3669
3670	remove_node_monitors(context);
3671	free(context->fds);
3672	free(context);
3673
3674	return B_OK;
3675}
3676
3677
3678static status_t
3679resize_monitor_table(struct io_context* context, const int newSize)
3680{
3681	int	status = B_OK;
3682
3683	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3684		return B_BAD_VALUE;
3685
3686	mutex_lock(&context->io_mutex);
3687
3688	if ((size_t)newSize < context->num_monitors) {
3689		status = B_BUSY;
3690		goto out;
3691	}
3692	context->max_monitors = newSize;
3693
3694out:
3695	mutex_unlock(&context->io_mutex);
3696	return status;
3697}
3698
3699
3700//	#pragma mark - public API for file systems
3701
3702
3703extern "C" status_t
3704new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3705	fs_vnode_ops* ops)
3706{
3707	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3708		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3709
3710	if (privateNode == NULL)
3711		return B_BAD_VALUE;
3712
3713	int32 tries = BUSY_VNODE_RETRIES;
3714restart:
3715	// create the node
3716	bool nodeCreated;
3717	struct vnode* vnode;
3718	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3719		nodeCreated);
3720	if (status != B_OK)
3721		return status;
3722
3723	WriteLocker nodeLocker(sVnodeLock, true);
3724		// create_new_vnode_and_lock() has locked for us
3725
3726	if (!nodeCreated && vnode->IsBusy()) {
3727		nodeLocker.Unlock();
3728		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3729			return B_BUSY;
3730		goto restart;
3731	}
3732
3733	// file system integrity check:
3734	// test if the vnode already exists and bail out if this is the case!
3735	if (!nodeCreated) {
3736		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3737			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3738			vnode->private_node);
3739		return B_ERROR;
3740	}
3741
3742	vnode->private_node = privateNode;
3743	vnode->ops = ops;
3744	vnode->SetUnpublished(true);
3745
3746	TRACE(("returns: %s\n", strerror(status)));
3747
3748	return status;
3749}
3750
3751
3752extern "C" status_t
3753publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3754	fs_vnode_ops* ops, int type, uint32 flags)
3755{
3756	FUNCTION(("publish_vnode()\n"));
3757
3758	int32 tries = BUSY_VNODE_RETRIES;
3759restart:
3760	WriteLocker locker(sVnodeLock);
3761
3762	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3763
3764	bool nodeCreated = false;
3765	if (vnode == NULL) {
3766		if (privateNode == NULL)
3767			return B_BAD_VALUE;
3768
3769		// create the node
3770		locker.Unlock();
3771			// create_new_vnode_and_lock() will re-lock for us on success
3772		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3773			nodeCreated);
3774		if (status != B_OK)
3775			return status;
3776
3777		locker.SetTo(sVnodeLock, true);
3778	}
3779
3780	if (nodeCreated) {
3781		vnode->private_node = privateNode;
3782		vnode->ops = ops;
3783		vnode->SetUnpublished(true);
3784	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3785		&& vnode->private_node == privateNode && vnode->ops == ops) {
3786		// already known, but not published
3787	} else if (vnode->IsBusy()) {
3788		locker.Unlock();
3789		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3790			return B_BUSY;
3791		goto restart;
3792	} else
3793		return B_BAD_VALUE;
3794
3795	bool publishSpecialSubNode = false;
3796
3797	vnode->SetType(type);
3798	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3799	publishSpecialSubNode = is_special_node_type(type)
3800		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3801
3802	status_t status = B_OK;
3803
3804	// create sub vnodes, if necessary
3805	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3806		locker.Unlock();
3807
3808		fs_volume* subVolume = volume;
3809		if (volume->sub_volume != NULL) {
3810			while (status == B_OK && subVolume->sub_volume != NULL) {
3811				subVolume = subVolume->sub_volume;
3812				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3813					vnode);
3814			}
3815		}
3816
3817		if (status == B_OK && publishSpecialSubNode)
3818			status = create_special_sub_node(vnode, flags);
3819
3820		if (status != B_OK) {
3821			// error -- clean up the created sub vnodes
3822			while (subVolume->super_volume != volume) {
3823				subVolume = subVolume->super_volume;
3824				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3825			}
3826		}
3827
3828		if (status == B_OK) {
3829			ReadLocker vnodesReadLocker(sVnodeLock);
3830			AutoLocker<Vnode> nodeLocker(vnode);
3831			vnode->SetBusy(false);
3832			vnode->SetUnpublished(false);
3833		} else {
3834			locker.Lock();
3835			sVnodeTable->Remove(vnode);
3836			remove_vnode_from_mount_list(vnode, vnode->mount);
3837			free(vnode);
3838		}
3839	} else {
3840		// we still hold the write lock -- mark the node unbusy and published
3841		vnode->SetBusy(false);
3842		vnode->SetUnpublished(false);
3843	}
3844
3845	TRACE(("returns: %s\n", strerror(status)));
3846
3847	return status;
3848}
3849
3850
3851extern "C" status_t
3852get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3853{
3854	struct vnode* vnode;
3855
3856	if (volume == NULL)
3857		return B_BAD_VALUE;
3858
3859	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3860	if (status != B_OK)
3861		return status;
3862
3863	// If this is a layered FS, we need to get the node cookie for the requested
3864	// layer.
3865	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3866		fs_vnode resolvedNode;
3867		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3868			&resolvedNode);
3869		if (status != B_OK) {
3870			panic("get_vnode(): Failed to get super node for vnode %p, "
3871				"volume: %p", vnode, volume);
3872			put_vnode(vnode);
3873			return status;
3874		}
3875
3876		if (_privateNode != NULL)
3877			*_privateNode = resolvedNode.private_node;
3878	} else if (_privateNode != NULL)
3879		*_privateNode = vnode->private_node;
3880
3881	return B_OK;
3882}
3883
3884
3885extern "C" status_t
3886acquire_vnode(fs_volume* volume, ino_t vnodeID)
3887{
3888	struct vnode* vnode;
3889
3890	rw_lock_read_lock(&sVnodeLock);
3891	vnode = lookup_vnode(volume->id, vnodeID);
3892	rw_lock_read_unlock(&sVnodeLock);
3893
3894	if (vnode == NULL)
3895		return B_BAD_VALUE;
3896
3897	inc_vnode_ref_count(vnode);
3898	return B_OK;
3899}
3900
3901
3902extern "C" status_t
3903put_vnode(fs_volume* volume, ino_t vnodeID)
3904{
3905	struct vnode* vnode;
3906
3907	rw_lock_read_lock(&sVnodeLock);
3908	vnode = lookup_vnode(volume->id, vnodeID);
3909	rw_lock_read_unlock(&sVnodeLock);
3910
3911	if (vnode == NULL)
3912		return B_BAD_VALUE;
3913
3914	dec_vnode_ref_count(vnode, false, true);
3915	return B_OK;
3916}
3917
3918
3919extern "C" status_t
3920remove_vnode(fs_volume* volume, ino_t vnodeID)
3921{
3922	ReadLocker locker(sVnodeLock);
3923
3924	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3925	if (vnode == NULL)
3926		return B_ENTRY_NOT_FOUND;
3927
3928	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3929		// this vnode is in use
3930		return B_BUSY;
3931	}
3932
3933	vnode->Lock();
3934
3935	vnode->SetRemoved(true);
3936	bool removeUnpublished = false;
3937
3938	if (vnode->IsUnpublished()) {
3939		// prepare the vnode for deletion
3940		removeUnpublished = true;
3941		vnode->SetBusy(true);
3942	}
3943
3944	vnode->Unlock();
3945	locker.Unlock();
3946
3947	if (removeUnpublished) {
3948		// If the vnode hasn't been published yet, we delete it here
3949		atomic_add(&vnode->ref_count, -1);
3950		free_vnode(vnode, true);
3951	}
3952
3953	return B_OK;
3954}
3955
3956
3957extern "C" status_t
3958unremove_vnode(fs_volume* volume, ino_t vnodeID)
3959{
3960	struct vnode* vnode;
3961
3962	rw_lock_read_lock(&sVnodeLock);
3963
3964	vnode = lookup_vnode(volume->id, vnodeID);
3965	if (vnode) {
3966		AutoLocker<Vnode> nodeLocker(vnode);
3967		vnode->SetRemoved(false);
3968	}
3969
3970	rw_lock_read_unlock(&sVnodeLock);
3971	return B_OK;
3972}
3973
3974
3975extern "C" status_t
3976get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3977{
3978	ReadLocker _(sVnodeLock);
3979
3980	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3981		if (_removed != NULL)
3982			*_removed = vnode->IsRemoved();
3983		return B_OK;
3984	}
3985
3986	return B_BAD_VALUE;
3987}
3988
3989
3990extern "C" status_t
3991mark_vnode_busy(fs_volume* volume, ino_t vnodeID, bool busy)
3992{
3993	ReadLocker locker(sVnodeLock);
3994
3995	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3996	if (vnode == NULL)
3997		return B_ENTRY_NOT_FOUND;
3998
3999	// are we trying to mark an already busy node busy again?
4000	if (busy && vnode->IsBusy())
4001		return B_BUSY;
4002
4003	vnode->Lock();
4004	vnode->SetBusy(busy);
4005	vnode->Unlock();
4006
4007	return B_OK;
4008}
4009
4010
4011extern "C" status_t
4012change_vnode_id(fs_volume* volume, ino_t vnodeID, ino_t newID)
4013{
4014	WriteLocker locker(sVnodeLock);
4015
4016	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
4017	if (vnode == NULL)
4018		return B_ENTRY_NOT_FOUND;
4019
4020	sVnodeTable->Remove(vnode);
4021	vnode->id = newID;
4022	sVnodeTable->Insert(vnode);
4023
4024	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
4025		((VMVnodeCache*)vnode->cache)->SetVnodeID(newID);
4026
4027	return B_OK;
4028}
4029
4030
4031extern "C" fs_volume*
4032volume_for_vnode(fs_vnode* _vnode)
4033{
4034	if (_vnode == NULL)
4035		return NULL;
4036
4037	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
4038	return vnode->mount->volume;
4039}
4040
4041
4042extern "C" status_t
4043check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
4044	uid_t nodeUserID)
4045{
4046	// get node permissions
4047	int userPermissions = (mode & S_IRWXU) >> 6;
4048	int groupPermissions = (mode & S_IRWXG) >> 3;
4049	int otherPermissions = mode & S_IRWXO;
4050
4051	// get the node permissions for this uid/gid
4052	int permissions = 0;
4053	uid_t uid = geteuid();
4054
4055	if (uid == 0) {
4056		// user is root
4057		// root has always read/write permission, but at least one of the
4058		// X bits must be set for execute permission
4059		permissions = userPermissions | groupPermissions | otherPermissions
4060			| S_IROTH | S_IWOTH;
4061		if (S_ISDIR(mode))
4062			permissions |= S_IXOTH;
4063	} else if (uid == nodeUserID) {
4064		// user is node owner
4065		permissions = userPermissions;
4066	} else if (is_user_in_group(nodeGroupID)) {
4067		// user is in owning group
4068		permissions = groupPermissions;
4069	} else {
4070		// user is one of the others
4071		permissions = otherPermissions;
4072	}
4073
4074	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4075}
4076
4077
4078#if 0
4079extern "C" status_t
4080read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4081	size_t* _numBytes)
4082{
4083	struct file_descriptor* descriptor;
4084	struct vnode* vnode;
4085
4086	descriptor = get_fd_and_vnode(fd, &vnode, true);
4087	if (descriptor == NULL)
4088		return B_FILE_ERROR;
4089
4090	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4091		count, 0, _numBytes);
4092
4093	put_fd(descriptor);
4094	return status;
4095}
4096
4097
4098extern "C" status_t
4099write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4100	size_t* _numBytes)
4101{
4102	struct file_descriptor* descriptor;
4103	struct vnode* vnode;
4104
4105	descriptor = get_fd_and_vnode(fd, &vnode, true);
4106	if (descriptor == NULL)
4107		return B_FILE_ERROR;
4108
4109	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4110		count, 0, _numBytes);
4111
4112	put_fd(descriptor);
4113	return status;
4114}
4115#endif
4116
4117
4118extern "C" status_t
4119read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4120	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4121	size_t* _bytes)
4122{
4123	struct file_descriptor* descriptor;
4124	struct vnode* vnode;
4125
4126	descriptor = get_fd_and_vnode(fd, &vnode, true);
4127	if (descriptor == NULL)
4128		return B_FILE_ERROR;
4129
4130	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4131		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4132		false);
4133
4134	put_fd(descriptor);
4135	return status;
4136}
4137
4138
4139extern "C" status_t
4140write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4141	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4142	size_t* _bytes)
4143{
4144	struct file_descriptor* descriptor;
4145	struct vnode* vnode;
4146
4147	descriptor = get_fd_and_vnode(fd, &vnode, true);
4148	if (descriptor == NULL)
4149		return B_FILE_ERROR;
4150
4151	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4152		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4153		true);
4154
4155	put_fd(descriptor);
4156	return status;
4157}
4158
4159
4160extern "C" status_t
4161entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4162{
4163	// lookup mount -- the caller is required to make sure that the mount
4164	// won't go away
4165	MutexLocker locker(sMountMutex);
4166	struct fs_mount* mount = find_mount(mountID);
4167	if (mount == NULL)
4168		return B_BAD_VALUE;
4169	locker.Unlock();
4170
4171	return mount->entry_cache.Add(dirID, name, nodeID, false);
4172}
4173
4174
4175extern "C" status_t
4176entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4177{
4178	// lookup mount -- the caller is required to make sure that the mount
4179	// won't go away
4180	MutexLocker locker(sMountMutex);
4181	struct fs_mount* mount = find_mount(mountID);
4182	if (mount == NULL)
4183		return B_BAD_VALUE;
4184	locker.Unlock();
4185
4186	return mount->entry_cache.Add(dirID, name, -1, true);
4187}
4188
4189
4190extern "C" status_t
4191entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4192{
4193	// lookup mount -- the caller is required to make sure that the mount
4194	// won't go away
4195	MutexLocker locker(sMountMutex);
4196	struct fs_mount* mount = find_mount(mountID);
4197	if (mount == NULL)
4198		return B_BAD_VALUE;
4199	locker.Unlock();
4200
4201	return mount->entry_cache.Remove(dirID, name);
4202}
4203
4204
4205//	#pragma mark - private VFS API
4206//	Functions the VFS exports for other parts of the kernel
4207
4208
4209/*! Acquires another reference to the vnode that has to be released
4210	by calling vfs_put_vnode().
4211*/
4212void
4213vfs_acquire_vnode(struct vnode* vnode)
4214{
4215	inc_vnode_ref_count(vnode);
4216}
4217
4218
4219/*! This is currently called from file_cache_create() only.
4220	It's probably a temporary solution as long as devfs requires that
4221	fs_read_pages()/fs_write_pages() are called with the standard
4222	open cookie and not with a device cookie.
4223	If that's done differently, remove this call; it has no other
4224	purpose.
4225*/
4226extern "C" status_t
4227vfs_get_cookie_from_fd(int fd, void** _cookie)
4228{
4229	struct file_descriptor* descriptor;
4230
4231	descriptor = get_fd(get_current_io_context(true), fd);
4232	if (descriptor == NULL)
4233		return B_FILE_ERROR;
4234
4235	*_cookie = descriptor->cookie;
4236	return B_OK;
4237}
4238
4239
4240extern "C" status_t
4241vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4242{
4243	*vnode = get_vnode_from_fd(fd, kernel);
4244
4245	if (*vnode == NULL)
4246		return B_FILE_ERROR;
4247
4248	return B_NO_ERROR;
4249}
4250
4251
4252extern "C" status_t
4253vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4254{
4255	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4256		path, kernel));
4257
4258	KPath pathBuffer;
4259	if (pathBuffer.InitCheck() != B_OK)
4260		return B_NO_MEMORY;
4261
4262	char* buffer = pathBuffer.LockBuffer();
4263	strlcpy(buffer, path, pathBuffer.BufferSize());
4264
4265	struct vnode* vnode;
4266	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4267	if (status != B_OK)
4268		return status;
4269
4270	*_vnode = vnode;
4271	return B_OK;
4272}
4273
4274
4275extern "C" status_t
4276vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4277{
4278	struct vnode* vnode = NULL;
4279
4280	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4281	if (status != B_OK)
4282		return status;
4283
4284	*_vnode = vnode;
4285	return B_OK;
4286}
4287
4288
4289extern "C" status_t
4290vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4291	const char* name, struct vnode** _vnode)
4292{
4293	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4294}
4295
4296
4297extern "C" void
4298vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4299{
4300	*_mountID = vnode->device;
4301	*_vnodeID = vnode->id;
4302}
4303
4304
4305/*!
4306	Helper function abstracting the process of "converting" a given
4307	vnode-pointer to a fs_vnode-pointer.
4308	Currently only used in bindfs.
4309*/
4310extern "C" fs_vnode*
4311vfs_fsnode_for_vnode(struct vnode* vnode)
4312{
4313	return vnode;
4314}
4315
4316
4317/*!
4318	Calls fs_open() on the given vnode and returns a new
4319	file descriptor for it
4320*/
4321int
4322vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4323{
4324	return open_vnode(vnode, openMode, kernel);
4325}
4326
4327
4328/*!	Looks up a vnode with the given mount and vnode ID.
4329	Must only be used with "in-use" vnodes as it doesn't grab a reference
4330	to the node.
4331	It's currently only be used by file_cache_create().
4332*/
4333extern "C" status_t
4334vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4335{
4336	rw_lock_read_lock(&sVnodeLock);
4337	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4338	rw_lock_read_unlock(&sVnodeLock);
4339
4340	if (vnode == NULL)
4341		return B_ERROR;
4342
4343	*_vnode = vnode;
4344	return B_OK;
4345}
4346
4347
4348extern "C" status_t
4349vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4350	bool traverseLeafLink, bool kernel, void** _node)
4351{
4352	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4353		volume, path, kernel));
4354
4355	KPath pathBuffer;
4356	if (pathBuffer.InitCheck() != B_OK)
4357		return B_NO_MEMORY;
4358
4359	fs_mount* mount;
4360	status_t status = get_mount(volume->id, &mount);
4361	if (status != B_OK)
4362		return status;
4363
4364	char* buffer = pathBuffer.LockBuffer();
4365	strlcpy(buffer, path, pathBuffer.BufferSize());
4366
4367	struct vnode* vnode = mount->root_vnode;
4368
4369	if (buffer[0] == '/')
4370		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4371	else {
4372		inc_vnode_ref_count(vnode);
4373			// vnode_path_to_vnode() releases a reference to the starting vnode
4374		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4375			kernel, &vnode, NULL);
4376	}
4377
4378	put_mount(mount);
4379
4380	if (status != B_OK)
4381		return status;
4382
4383	if (vnode->device != volume->id) {
4384		// wrong mount ID - must not gain access on foreign file system nodes
4385		put_vnode(vnode);
4386		return B_BAD_VALUE;
4387	}
4388
4389	// Use get_vnode() to resolve the cookie for the right layer.
4390	status = get_vnode(volume, vnode->id, _node);
4391	put_vnode(vnode);
4392
4393	return status;
4394}
4395
4396
4397status_t
4398vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4399	struct stat* stat, bool kernel)
4400{
4401	status_t status;
4402
4403	if (path != NULL) {
4404		// path given: get the stat of the node referred to by (fd, path)
4405		KPath pathBuffer(path);
4406		if (pathBuffer.InitCheck() != B_OK)
4407			return B_NO_MEMORY;
4408
4409		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4410			traverseLeafLink, stat, kernel);
4411	} else {
4412		// no path given: get the FD and use the FD operation
4413		struct file_descriptor* descriptor
4414			= get_fd(get_current_io_context(kernel), fd);
4415		if (descriptor == NULL)
4416			return B_FILE_ERROR;
4417
4418		if (descriptor->ops->fd_read_stat)
4419			status = descriptor->ops->fd_read_stat(descriptor, stat);
4420		else
4421			status = B_UNSUPPORTED;
4422
4423		put_fd(descriptor);
4424	}
4425
4426	return status;
4427}
4428
4429
4430/*!	Finds the full path to the file that contains the module \a moduleName,
4431	puts it into \a pathBuffer, and returns B_OK for success.
4432	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
4433	\c B_ENTRY_NOT_FOUNT if no file could be found.
4434	\a pathBuffer is clobbered in any case and must not be relied on if this
4435	functions returns unsuccessfully.
4436	\a basePath and \a pathBuffer must not point to the same space.
4437*/
4438status_t
4439vfs_get_module_path(const char* basePath, const char* moduleName,
4440	char* pathBuffer, size_t bufferSize)
4441{
4442	struct vnode* dir;
4443	struct vnode* file;
4444	status_t status;
4445	size_t length;
4446	char* path;
4447
4448	if (bufferSize == 0
4449		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4450		return B_BUFFER_OVERFLOW;
4451
4452	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4453	if (status != B_OK)
4454		return status;
4455
4456	// the path buffer had been clobbered by the above call
4457	length = strlcpy(pathBuffer, basePath, bufferSize);
4458	if (pathBuffer[length - 1] != '/')
4459		pathBuffer[length++] = '/';
4460
4461	path = pathBuffer + length;
4462	bufferSize -= length;
4463
4464	while (moduleName) {
4465		char* nextPath = strchr(moduleName, '/');
4466		if (nextPath == NULL)
4467			length = strlen(moduleName);
4468		else {
4469			length = nextPath - moduleName;
4470			nextPath++;
4471		}
4472
4473		if (length + 1 >= bufferSize) {
4474			status = B_BUFFER_OVERFLOW;
4475			goto err;
4476		}
4477
4478		memcpy(path, moduleName, length);
4479		path[length] = '\0';
4480		moduleName = nextPath;
4481
4482		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4483		if (status != B_OK) {
4484			// vnode_path_to_vnode() has already released the reference to dir
4485			return status;
4486		}
4487
4488		if (S_ISDIR(file->Type())) {
4489			// goto the next directory
4490			path[length] = '/';
4491			path[length + 1] = '\0';
4492			path += length + 1;
4493			bufferSize -= length + 1;
4494
4495			dir = file;
4496		} else if (S_ISREG(file->Type())) {
4497			// it's a file so it should be what we've searched for
4498			put_vnode(file);
4499
4500			return B_OK;
4501		} else {
4502			TRACE(("vfs_get_module_path(): something is strange here: "
4503				"0x%08" B_PRIx32 "...\n", file->Type()));
4504			status = B_ERROR;
4505			dir = file;
4506			goto err;
4507		}
4508	}
4509
4510	// if we got here, the moduleName just pointed to a directory, not to
4511	// a real module - what should we do in this case?
4512	status = B_ENTRY_NOT_FOUND;
4513
4514err:
4515	put_vnode(dir);
4516	return status;
4517}
4518
4519
4520/*!	\brief Normalizes a given path.
4521
4522	The path must refer to an existing or non-existing entry in an existing
4523	directory, that is chopping off the leaf component the remaining path must
4524	refer to an existing directory.
4525
4526	The returned will be canonical in that it will be absolute, will not
4527	contain any "." or ".." components or duplicate occurrences of '/'s,
4528	and none of the directory components will by symbolic links.
4529
4530	Any two paths referring to the same entry, will result in the same
4531	normalized path (well, that is pretty much the definition of `normalized',
4532	isn't it :-).
4533
4534	\param path The path to be normalized.
4535	\param buffer The buffer into which the normalized path will be written.
4536		   May be the same one as \a path.
4537	\param bufferSize The size of \a buffer.
4538	\param traverseLink If \c true, the function also resolves leaf symlinks.
4539	\param kernel \c true, if the IO context of the kernel shall be used,
4540		   otherwise that of the team this thread belongs to. Only relevant,
4541		   if the path is relative (to get the CWD).
4542	\return \c B_OK if everything went fine, another error code otherwise.
4543*/
4544status_t
4545vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4546	bool traverseLink, bool kernel)
4547{
4548	if (!path || !buffer || bufferSize < 1)
4549		return B_BAD_VALUE;
4550
4551	if (path != buffer) {
4552		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4553			return B_BUFFER_OVERFLOW;
4554	}
4555
4556	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4557}
4558
4559
4560/*!	\brief Gets the parent of the passed in node.
4561
4562	Gets the parent of the passed in node, and correctly resolves covered
4563	nodes.
4564*/
4565extern "C" status_t
4566vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4567{
4568	return resolve_covered_parent(parent, device, node,
4569		get_current_io_context(true));
4570}
4571
4572
4573/*!	\brief Creates a special node in the file system.
4574
4575	The caller gets a reference to the newly created node (which is passed
4576	back through \a _createdVnode) and is responsible for releasing it.
4577
4578	\param path The path where to create the entry for the node. Can be \c NULL,
4579		in which case the node is created without an entry in the root FS -- it
4580		will automatically be deleted when the last reference has been released.
4581	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4582		the target file system will just create the node with its standard
4583		operations. Depending on the type of the node a subnode might be created
4584		automatically, though.
4585	\param mode The type and permissions for the node to be created.
4586	\param flags Flags to be passed to the creating FS.
4587	\param kernel \c true, if called in the kernel context (relevant only if
4588		\a path is not \c NULL and not absolute).
4589	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4590		file system creating the node, with the private data pointer and
4591		operations for the super node. Can be \c NULL.
4592	\param _createVnode Pointer to pre-allocated storage where to store the
4593		pointer to the newly created node.
4594	\return \c B_OK, if everything went fine, another error code otherwise.
4595*/
4596status_t
4597vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4598	uint32 flags, bool kernel, fs_vnode* _superVnode,
4599	struct vnode** _createdVnode)
4600{
4601	struct vnode* dirNode;
4602	char _leaf[B_FILE_NAME_LENGTH];
4603	char* leaf = NULL;
4604
4605	if (path) {
4606		// We've got a path. Get the dir vnode and the leaf name.
4607		KPath tmpPathBuffer;
4608		if (tmpPathBuffer.InitCheck() != B_OK)
4609			return B_NO_MEMORY;
4610
4611		char* tmpPath = tmpPathBuffer.LockBuffer();
4612		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4613			return B_NAME_TOO_LONG;
4614
4615		// get the dir vnode and the leaf name
4616		leaf = _leaf;
4617		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4618		if (error != B_OK)
4619			return error;
4620	} else {
4621		// No path. Create the node in the root FS.
4622		dirNode = sRoot;
4623		inc_vnode_ref_count(dirNode);
4624	}
4625
4626	VNodePutter _(dirNode);
4627
4628	// check support for creating special nodes
4629	if (!HAS_FS_CALL(dirNode, create_special_node))
4630		return B_UNSUPPORTED;
4631
4632	// create the node
4633	fs_vnode superVnode;
4634	ino_t nodeID;
4635	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4636		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4637	if (status != B_OK)
4638		return status;
4639
4640	// lookup the node
4641	rw_lock_read_lock(&sVnodeLock);
4642	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4643	rw_lock_read_unlock(&sVnodeLock);
4644
4645	if (*_createdVnode == NULL) {
4646		panic("vfs_create_special_node(): lookup of node failed");
4647		return B_ERROR;
4648	}
4649
4650	return B_OK;
4651}
4652
4653
4654extern "C" void
4655vfs_put_vnode(struct vnode* vnode)
4656{
4657	put_vnode(vnode);
4658}
4659
4660
4661extern "C" status_t
4662vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4663{
4664	// Get current working directory from io context
4665	struct io_context* context = get_current_io_context(false);
4666	status_t status = B_OK;
4667
4668	mutex_lock(&context->io_mutex);
4669
4670	if (context->cwd != NULL) {
4671		*_mountID = context->cwd->device;
4672		*_vnodeID = context->cwd->id;
4673	} else
4674		status = B_ERROR;
4675
4676	mutex_unlock(&context->io_mutex);
4677	return status;
4678}
4679
4680
4681status_t
4682vfs_unmount(dev_t mountID, uint32 flags)
4683{
4684	return fs_unmount(NULL, mountID, flags, true);
4685}
4686
4687
4688extern "C" status_t
4689vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4690{
4691	struct vnode* vnode;
4692
4693	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4694	if (status != B_OK)
4695		return status;
4696
4697	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4698	put_vnode(vnode);
4699	return B_OK;
4700}
4701
4702
4703extern "C" void
4704vfs_free_unused_vnodes(int32 level)
4705{
4706	vnode_low_resource_handler(NULL,
4707		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4708			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4709		level);
4710}
4711
4712
4713extern "C" bool
4714vfs_can_page(struct vnode* vnode, void* cookie)
4715{
4716	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4717
4718	if (HAS_FS_CALL(vnode, can_page))
4719		return FS_CALL(vnode, can_page, cookie);
4720	return false;
4721}
4722
4723
4724extern "C" status_t
4725vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4726	const generic_io_vec* vecs, size_t count, uint32 flags,
4727	generic_size_t* _numBytes)
4728{
4729	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4730		vecs, pos));
4731
4732#if VFS_PAGES_IO_TRACING
4733	generic_size_t bytesRequested = *_numBytes;
4734#endif
4735
4736	IORequest request;
4737	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4738	if (status == B_OK) {
4739		status = vfs_vnode_io(vnode, cookie, &request);
4740		if (status == B_OK)
4741			status = request.Wait();
4742		*_numBytes = request.TransferredBytes();
4743	}
4744
4745	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4746		status, *_numBytes));
4747
4748	return status;
4749}
4750
4751
4752extern "C" status_t
4753vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4754	const generic_io_vec* vecs, size_t count, uint32 flags,
4755	generic_size_t* _numBytes)
4756{
4757	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4758		vecs, pos));
4759
4760#if VFS_PAGES_IO_TRACING
4761	generic_size_t bytesRequested = *_numBytes;
4762#endif
4763
4764	IORequest request;
4765	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4766	if (status == B_OK) {
4767		status = vfs_vnode_io(vnode, cookie, &request);
4768		if (status == B_OK)
4769			status = request.Wait();
4770		*_numBytes = request.TransferredBytes();
4771	}
4772
4773	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4774		status, *_numBytes));
4775
4776	return status;
4777}
4778
4779
4780/*!	Gets the vnode's VMCache object. If it didn't have one, it will be
4781	created if \a allocate is \c true.
4782	In case it's successful, it will also grab a reference to the cache
4783	it returns.
4784*/
4785extern "C" status_t
4786vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4787{
4788	if (vnode->cache != NULL) {
4789		vnode->cache->AcquireRef();
4790		*_cache = vnode->cache;
4791		return B_OK;
4792	}
4793
4794	rw_lock_read_lock(&sVnodeLock);
4795	vnode->Lock();
4796
4797	status_t status = B_OK;
4798
4799	// The cache could have been created in the meantime
4800	if (vnode->cache == NULL) {
4801		if (allocate) {
4802			// TODO: actually the vnode needs to be busy already here, or
4803			//	else this won't work...
4804			bool wasBusy = vnode->IsBusy();
4805			vnode->SetBusy(true);
4806
4807			vnode->Unlock();
4808			rw_lock_read_unlock(&sVnodeLock);
4809
4810			status = vm_create_vnode_cache(vnode, &vnode->cache);
4811
4812			rw_lock_read_lock(&sVnodeLock);
4813			vnode->Lock();
4814			vnode->SetBusy(wasBusy);
4815		} else
4816			status = B_BAD_VALUE;
4817	}
4818
4819	vnode->Unlock();
4820	rw_lock_read_unlock(&sVnodeLock);
4821
4822	if (status == B_OK) {
4823		vnode->cache->AcquireRef();
4824		*_cache = vnode->cache;
4825	}
4826
4827	return status;
4828}
4829
4830
4831/*!	Sets the vnode's VMCache object, for subsystems that want to manage
4832	their own.
4833	In case it's successful, it will also grab a reference to the cache
4834	it returns.
4835*/
4836extern "C" status_t
4837vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4838{
4839	rw_lock_read_lock(&sVnodeLock);
4840	vnode->Lock();
4841
4842	status_t status = B_OK;
4843	if (vnode->cache != NULL) {
4844		status = B_NOT_ALLOWED;
4845	} else {
4846		vnode->cache = _cache;
4847		_cache->AcquireRef();
4848	}
4849
4850	vnode->Unlock();
4851	rw_lock_read_unlock(&sVnodeLock);
4852	return status;
4853}
4854
4855
4856status_t
4857vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4858	file_io_vec* vecs, size_t* _count)
4859{
4860	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4861		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4862
4863	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4864}
4865
4866
4867status_t
4868vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4869{
4870	status_t status = FS_CALL(vnode, read_stat, stat);
4871
4872	// fill in the st_dev and st_ino fields
4873	if (status == B_OK) {
4874		stat->st_dev = vnode->device;
4875		stat->st_ino = vnode->id;
4876		// the rdev field must stay unset for non-special files
4877		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4878			stat->st_rdev = -1;
4879	}
4880
4881	return status;
4882}
4883
4884
4885status_t
4886vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4887{
4888	struct vnode* vnode;
4889	status_t status = get_vnode(device, inode, &vnode, true, false);
4890	if (status != B_OK)
4891		return status;
4892
4893	status = vfs_stat_vnode(vnode, stat);
4894
4895	put_vnode(vnode);
4896	return status;
4897}
4898
4899
4900status_t
4901vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4902{
4903	return get_vnode_name(vnode, NULL, name, nameSize, true);
4904}
4905
4906
4907status_t
4908vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4909	bool kernel, char* path, size_t pathLength)
4910{
4911	struct vnode* vnode;
4912	status_t status;
4913
4914	// filter invalid leaf names
4915	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4916		return B_BAD_VALUE;
4917
4918	// get the vnode matching the dir's node_ref
4919	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4920		// special cases "." and "..": we can directly get the vnode of the
4921		// referenced directory
4922		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4923		leaf = NULL;
4924	} else
4925		status = get_vnode(device, inode, &vnode, true, false);
4926	if (status != B_OK)
4927		return status;
4928
4929	// get the directory path
4930	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4931	put_vnode(vnode);
4932		// we don't need the vnode anymore
4933	if (status != B_OK)
4934		return status;
4935
4936	// append the leaf name
4937	if (leaf) {
4938		// insert a directory separator if this is not the file system root
4939		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4940				>= pathLength)
4941			|| strlcat(path, leaf, pathLength) >= pathLength) {
4942			return B_NAME_TOO_LONG;
4943		}
4944	}
4945
4946	return B_OK;
4947}
4948
4949
4950/*!	If the given descriptor locked its vnode, that lock will be released. */
4951void
4952vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4953{
4954	struct vnode* vnode = fd_vnode(descriptor);
4955
4956	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4957		vnode->mandatory_locked_by = NULL;
4958}
4959
4960
4961/*!	Releases any POSIX locks on the file descriptor. */
4962status_t
4963vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4964{
4965	struct vnode* vnode = descriptor->u.vnode;
4966	if (vnode == NULL)
4967		return B_OK;
4968
4969	if (HAS_FS_CALL(vnode, release_lock))
4970		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4971
4972	return release_advisory_lock(vnode, context, NULL, NULL);
4973}
4974
4975
4976/*!	Closes all file descriptors of the specified I/O context that
4977	have the O_CLOEXEC flag set.
4978*/
4979void
4980vfs_exec_io_context(io_context* context)
4981{
4982	uint32 i;
4983
4984	for (i = 0; i < context->table_size; i++) {
4985		mutex_lock(&context->io_mutex);
4986
4987		struct file_descriptor* descriptor = context->fds[i];
4988		bool remove = false;
4989
4990		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4991			context->fds[i] = NULL;
4992			context->num_used_fds--;
4993
4994			remove = true;
4995		}
4996
4997		mutex_unlock(&context->io_mutex);
4998
4999		if (remove) {
5000			close_fd(context, descriptor);
5001			put_fd(descriptor);
5002		}
5003	}
5004}
5005
5006
5007/*! Sets up a new io_control structure, and inherits the properties
5008	of the parent io_control if it is given.
5009*/
5010io_context*
5011vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
5012{
5013	io_context* context = (io_context*)malloc(sizeof(io_context));
5014	if (context == NULL)
5015		return NULL;
5016
5017	TIOC(NewIOContext(context, parentContext));
5018
5019	memset(context, 0, sizeof(io_context));
5020	context->ref_count = 1;
5021
5022	MutexLocker parentLocker;
5023
5024	size_t tableSize;
5025	if (parentContext != NULL) {
5026		parentLocker.SetTo(parentContext->io_mutex, false);
5027		tableSize = parentContext->table_size;
5028	} else
5029		tableSize = DEFAULT_FD_TABLE_SIZE;
5030
5031	// allocate space for FDs and their close-on-exec flag
5032	context->fds = (file_descriptor**)malloc(
5033		sizeof(struct file_descriptor*) * tableSize
5034		+ sizeof(struct select_sync*) * tableSize
5035		+ (tableSize + 7) / 8);
5036	if (context->fds == NULL) {
5037		free(context);
5038		return NULL;
5039	}
5040
5041	context->select_infos = (select_info**)(context->fds + tableSize);
5042	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
5043
5044	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
5045		+ sizeof(struct select_sync*) * tableSize
5046		+ (tableSize + 7) / 8);
5047
5048	mutex_init(&context->io_mutex, "I/O context");
5049
5050	// Copy all parent file descriptors
5051
5052	if (parentContext != NULL) {
5053		size_t i;
5054
5055		mutex_lock(&sIOContextRootLock);
5056		context->root = parentContext->root;
5057		if (context->root)
5058			inc_vnode_ref_count(context->root);
5059		mutex_unlock(&sIOContextRootLock);
5060
5061		context->cwd = parentContext->cwd;
5062		if (context->cwd)
5063			inc_vnode_ref_count(context->cwd);
5064
5065		if (parentContext->inherit_fds) {
5066			for (i = 0; i < tableSize; i++) {
5067				struct file_descriptor* descriptor = parentContext->fds[i];
5068
5069				if (descriptor != NULL
5070					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
5071					bool closeOnExec = fd_close_on_exec(parentContext, i);
5072					if (closeOnExec && purgeCloseOnExec)
5073						continue;
5074
5075					TFD(InheritFD(context, i, descriptor, parentContext));
5076
5077					context->fds[i] = descriptor;
5078					context->num_used_fds++;
5079					atomic_add(&descriptor->ref_count, 1);
5080					atomic_add(&descriptor->open_count, 1);
5081
5082					if (closeOnExec)
5083						fd_set_close_on_exec(context, i, true);
5084				}
5085			}
5086		}
5087
5088		parentLocker.Unlock();
5089	} else {
5090		context->root = sRoot;
5091		context->cwd = sRoot;
5092
5093		if (context->root)
5094			inc_vnode_ref_count(context->root);
5095
5096		if (context->cwd)
5097			inc_vnode_ref_count(context->cwd);
5098	}
5099
5100	context->table_size = tableSize;
5101	context->inherit_fds = parentContext != NULL;
5102
5103	list_init(&context->node_monitors);
5104	context->max_monitors = DEFAULT_NODE_MONITORS;
5105
5106	return context;
5107}
5108
5109
5110void
5111vfs_get_io_context(io_context* context)
5112{
5113	atomic_add(&context->ref_count, 1);
5114}
5115
5116
5117void
5118vfs_put_io_context(io_context* context)
5119{
5120	if (atomic_add(&context->ref_count, -1) == 1)
5121		free_io_context(context);
5122}
5123
5124
5125status_t
5126vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5127{
5128	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5129		return B_BAD_VALUE;
5130
5131	TIOC(ResizeIOContext(context, newSize));
5132
5133	MutexLocker _(context->io_mutex);
5134
5135	uint32 oldSize = context->table_size;
5136	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5137	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5138
5139	// If the tables shrink, make sure none of the fds being dropped are in use.
5140	if (newSize < oldSize) {
5141		for (uint32 i = oldSize; i-- > newSize;) {
5142			if (context->fds[i])
5143				return B_BUSY;
5144		}
5145	}
5146
5147	// store pointers to the old tables
5148	file_descriptor** oldFDs = context->fds;
5149	select_info** oldSelectInfos = context->select_infos;
5150	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5151
5152	// allocate new tables
5153	file_descriptor** newFDs = (file_descriptor**)malloc(
5154		sizeof(struct file_descriptor*) * newSize
5155		+ sizeof(struct select_sync*) * newSize
5156		+ newCloseOnExitBitmapSize);
5157	if (newFDs == NULL)
5158		return B_NO_MEMORY;
5159
5160	context->fds = newFDs;
5161	context->select_infos = (select_info**)(context->fds + newSize);
5162	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5163	context->table_size = newSize;
5164
5165	// copy entries from old tables
5166	uint32 toCopy = min_c(oldSize, newSize);
5167
5168	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5169	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5170	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5171		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5172
5173	// clear additional entries, if the tables grow
5174	if (newSize > oldSize) {
5175		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5176		memset(context->select_infos + oldSize, 0,
5177			sizeof(void*) * (newSize - oldSize));
5178		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5179			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5180	}
5181
5182	free(oldFDs);
5183
5184	return B_OK;
5185}
5186
5187
5188/*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5189
5190	Given an arbitrary vnode (identified by mount and node ID), the function
5191	checks, whether the vnode is covered by another vnode. If it is, the
5192	function returns the mount and node ID of the covering vnode. Otherwise
5193	it simply returns the supplied mount and node ID.
5194
5195	In case of error (e.g. the supplied node could not be found) the variables
5196	for storing the resolved mount and node ID remain untouched and an error
5197	code is returned.
5198
5199	\param mountID The mount ID of the vnode in question.
5200	\param nodeID The node ID of the vnode in question.
5201	\param resolvedMountID Pointer to storage for the resolved mount ID.
5202	\param resolvedNodeID Pointer to storage for the resolved node ID.
5203	\return
5204	- \c B_OK, if everything went fine,
5205	- another error code, if something went wrong.
5206*/
5207status_t
5208vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5209	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5210{
5211	// get the node
5212	struct vnode* node;
5213	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5214	if (error != B_OK)
5215		return error;
5216
5217	// resolve the node
5218	if (Vnode* coveringNode = get_covering_vnode(node)) {
5219		put_vnode(node);
5220		node = coveringNode;
5221	}
5222
5223	// set the return values
5224	*resolvedMountID = node->device;
5225	*resolvedNodeID = node->id;
5226
5227	put_vnode(node);
5228
5229	return B_OK;
5230}
5231
5232
5233status_t
5234vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5235	ino_t* _mountPointNodeID)
5236{
5237	ReadLocker nodeLocker(sVnodeLock);
5238	MutexLocker mountLocker(sMountMutex);
5239
5240	struct fs_mount* mount = find_mount(mountID);
5241	if (mount == NULL)
5242		return B_BAD_VALUE;
5243
5244	Vnode* mountPoint = mount->covers_vnode;
5245
5246	*_mountPointMountID = mountPoint->device;
5247	*_mountPointNodeID = mountPoint->id;
5248
5249	return B_OK;
5250}
5251
5252
5253status_t
5254vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5255	ino_t coveredNodeID)
5256{
5257	// get the vnodes
5258	Vnode* vnode;
5259	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5260	if (error != B_OK)
5261		return B_BAD_VALUE;
5262	VNodePutter vnodePutter(vnode);
5263
5264	Vnode* coveredVnode;
5265	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5266		false);
5267	if (error != B_OK)
5268		return B_BAD_VALUE;
5269	VNodePutter coveredVnodePutter(coveredVnode);
5270
5271	// establish the covered/covering links
5272	WriteLocker locker(sVnodeLock);
5273
5274	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5275		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5276		return B_BUSY;
5277	}
5278
5279	vnode->covers = coveredVnode;
5280	vnode->SetCovering(true);
5281
5282	coveredVnode->covered_by = vnode;
5283	coveredVnode->SetCovered(true);
5284
5285	// the vnodes do now reference each other
5286	inc_vnode_ref_count(vnode);
5287	inc_vnode_ref_count(coveredVnode);
5288
5289	return B_OK;
5290}
5291
5292
5293int
5294vfs_getrlimit(int resource, struct rlimit* rlp)
5295{
5296	if (!rlp)
5297		return B_BAD_ADDRESS;
5298
5299	switch (resource) {
5300		case RLIMIT_NOFILE:
5301		{
5302			struct io_context* context = get_current_io_context(false);
5303			MutexLocker _(context->io_mutex);
5304
5305			rlp->rlim_cur = context->table_size;
5306			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5307			return 0;
5308		}
5309
5310		case RLIMIT_NOVMON:
5311		{
5312			struct io_context* context = get_current_io_context(false);
5313			MutexLocker _(context->io_mutex);
5314
5315			rlp->rlim_cur = context->max_monitors;
5316			rlp->rlim_max = MAX_NODE_MONITORS;
5317			return 0;
5318		}
5319
5320		default:
5321			return B_BAD_VALUE;
5322	}
5323}
5324
5325
5326int
5327vfs_setrlimit(int resource, const struct rlimit* rlp)
5328{
5329	if (!rlp)
5330		return B_BAD_ADDRESS;
5331
5332	switch (resource) {
5333		case RLIMIT_NOFILE:
5334			/* TODO: check getuid() */
5335			if (rlp->rlim_max != RLIM_SAVED_MAX
5336				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5337				return B_NOT_ALLOWED;
5338
5339			return vfs_resize_fd_table(get_current_io_context(false),
5340				rlp->rlim_cur);
5341
5342		case RLIMIT_NOVMON:
5343			/* TODO: check getuid() */
5344			if (rlp->rlim_max != RLIM_SAVED_MAX
5345				&& rlp->rlim_max != MAX_NODE_MONITORS)
5346				return B_NOT_ALLOWED;
5347
5348			return resize_monitor_table(get_current_io_context(false),
5349				rlp->rlim_cur);
5350
5351		default:
5352			return B_BAD_VALUE;
5353	}
5354}
5355
5356
5357status_t
5358vfs_init(kernel_args* args)
5359{
5360	vnode::StaticInit();
5361
5362	sVnodeTable = new(std::nothrow) VnodeTable();
5363	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5364		panic("vfs_init: error creating vnode hash table\n");
5365
5366	struct vnode dummy_vnode;
5367	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5368
5369	struct fs_mount dummyMount;
5370	sMountsTable = new(std::nothrow) MountTable();
5371	if (sMountsTable == NULL
5372			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5373		panic("vfs_init: error creating mounts hash table\n");
5374
5375	sPathNameCache = create_object_cache("vfs path names",
5376		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5377	if (sPathNameCache == NULL)
5378		panic("vfs_init: error creating path name object_cache\n");
5379
5380	sFileDescriptorCache = create_object_cache("vfs fds",
5381		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5382	if (sFileDescriptorCache == NULL)
5383		panic("vfs_init: error creating file descriptor object_cache\n");
5384
5385	node_monitor_init();
5386
5387	sRoot = NULL;
5388
5389	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5390
5391	if (block_cache_init() != B_OK)
5392		return B_ERROR;
5393
5394#ifdef ADD_DEBUGGER_COMMANDS
5395	// add some debugger commands
5396	add_debugger_command_etc("vnode", &dump_vnode,
5397		"Print info about the specified vnode",
5398		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5399		"Prints information about the vnode specified by address <vnode> or\n"
5400		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5401		"constructed and printed. It might not be possible to construct a\n"
5402		"complete path, though.\n",
5403		0);
5404	add_debugger_command("vnodes", &dump_vnodes,
5405		"list all vnodes (from the specified device)");
5406	add_debugger_command("vnode_caches", &dump_vnode_caches,
5407		"list all vnode caches");
5408	add_debugger_command("mount", &dump_mount,
5409		"info about the specified fs_mount");
5410	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5411	add_debugger_command("io_context", &dump_io_context,
5412		"info about the I/O context");
5413	add_debugger_command("vnode_usage", &dump_vnode_usage,
5414		"info about vnode usage");
5415#endif
5416
5417	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5418		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5419			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5420		0);
5421
5422	fifo_init();
5423	file_map_init();
5424
5425	return file_cache_init();
5426}
5427
5428
5429//	#pragma mark - fd_ops implementations
5430
5431
5432/*!
5433	Calls fs_open() on the given vnode and returns a new
5434	file descriptor for it
5435*/
5436static int
5437open_vnode(struct vnode* vnode, int openMode, bool kernel)
5438{
5439	void* cookie;
5440	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5441	if (status != B_OK)
5442		return status;
5443
5444	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5445	if (fd < 0) {
5446		FS_CALL(vnode, close, cookie);
5447		FS_CALL(vnode, free_cookie, cookie);
5448	}
5449	return fd;
5450}
5451
5452
5453/*!
5454	Calls fs_open() on the given vnode and returns a new
5455	file descriptor for it
5456*/
5457static int
5458create_vnode(struct vnode* directory, const char* name, int openMode,
5459	int perms, bool kernel)
5460{
5461	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5462	status_t status = B_ERROR;
5463	struct vnode* vnode;
5464	void* cookie;
5465	ino_t newID;
5466
5467	// This is somewhat tricky: If the entry already exists, the FS responsible
5468	// for the directory might not necessarily also be the one responsible for
5469	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5470	// we can actually never call the create() hook without O_EXCL. Instead we
5471	// try to look the entry up first. If it already exists, we just open the
5472	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5473	// introduces a race condition, since someone else might have created the
5474	// entry in the meantime. We hope the respective FS returns the correct
5475	// error code and retry (up to 3 times) again.
5476
5477	for (int i = 0; i < 3 && status != B_OK; i++) {
5478		// look the node up
5479		status = lookup_dir_entry(directory, name, &vnode);
5480		if (status == B_OK) {
5481			VNodePutter putter(vnode);
5482
5483			if ((openMode & O_EXCL) != 0)
5484				return B_FILE_EXISTS;
5485
5486			// If the node is a symlink, we have to follow it, unless
5487			// O_NOTRAVERSE is set.
5488			if (S_ISLNK(vnode->Type()) && traverse) {
5489				putter.Put();
5490				char clonedName[B_FILE_NAME_LENGTH + 1];
5491				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5492						>= B_FILE_NAME_LENGTH) {
5493					return B_NAME_TOO_LONG;
5494				}
5495
5496				inc_vnode_ref_count(directory);
5497				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5498					kernel, &vnode, NULL);
5499				if (status != B_OK)
5500					return status;
5501
5502				putter.SetTo(vnode);
5503			}
5504
5505			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5506				return B_LINK_LIMIT;
5507
5508			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5509			// on success keep the vnode reference for the FD
5510			if (fd >= 0)
5511				putter.Detach();
5512
5513			return fd;
5514		}
5515
5516		// it doesn't exist yet -- try to create it
5517
5518		if (!HAS_FS_CALL(directory, create))
5519			return B_READ_ONLY_DEVICE;
5520
5521		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5522			&cookie, &newID);
5523		if (status != B_OK
5524			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5525			return status;
5526		}
5527	}
5528
5529	if (status != B_OK)
5530		return status;
5531
5532	// the node has been created successfully
5533
5534	rw_lock_read_lock(&sVnodeLock);
5535	vnode = lookup_vnode(directory->device, newID);
5536	rw_lock_read_unlock(&sVnodeLock);
5537
5538	if (vnode == NULL) {
5539		panic("vfs: fs_create() returned success but there is no vnode, "
5540			"mount ID %" B_PRIdDEV "!\n", directory->device);
5541		return B_BAD_VALUE;
5542	}
5543
5544	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5545	if (fd >= 0)
5546		return fd;
5547
5548	status = fd;
5549
5550	// something went wrong, clean up
5551
5552	FS_CALL(vnode, close, cookie);
5553	FS_CALL(vnode, free_cookie, cookie);
5554	put_vnode(vnode);
5555
5556	FS_CALL(directory, unlink, name);
5557
5558	return status;
5559}
5560
5561
5562/*! Calls fs open_dir() on the given vnode and returns a new
5563	file descriptor for it
5564*/
5565static int
5566open_dir_vnode(struct vnode* vnode, bool kernel)
5567{
5568	void* cookie;
5569	status_t status = FS_CALL(vnode, open_dir, &cookie);
5570	if (status != B_OK)
5571		return status;
5572
5573	// directory is opened, create a fd
5574	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5575	if (status >= 0)
5576		return status;
5577
5578	FS_CALL(vnode, close_dir, cookie);
5579	FS_CALL(vnode, free_dir_cookie, cookie);
5580
5581	return status;
5582}
5583
5584
5585/*! Calls fs open_attr_dir() on the given vnode and returns a new
5586	file descriptor for it.
5587	Used by attr_dir_open(), and attr_dir_open_fd().
5588*/
5589static int
5590open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5591{
5592	if (!HAS_FS_CALL(vnode, open_attr_dir))
5593		return B_UNSUPPORTED;
5594
5595	void* cookie;
5596	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5597	if (status != B_OK)
5598		return status;
5599
5600	// directory is opened, create a fd
5601	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5602		kernel);
5603	if (status >= 0)
5604		return status;
5605
5606	FS_CALL(vnode, close_attr_dir, cookie);
5607	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5608
5609	return status;
5610}
5611
5612
5613static int
5614file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5615	int openMode, int perms, bool kernel)
5616{
5617	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5618		"kernel %d\n", name, openMode, perms, kernel));
5619
5620	// get directory to put the new file in
5621	struct vnode* directory;
5622	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5623	if (status != B_OK)
5624		return status;
5625
5626	status = create_vnode(directory, name, openMode, perms, kernel);
5627	put_vnode(directory);
5628
5629	return status;
5630}
5631
5632
5633static int
5634file_create(int fd, char* path, int openMode, int perms, bool kernel)
5635{
5636	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5637		openMode, perms, kernel));
5638
5639	// get directory to put the new file in
5640	char name[B_FILE_NAME_LENGTH];
5641	struct vnode* directory;
5642	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5643		kernel);
5644	if (status < 0)
5645		return status;
5646
5647	status = create_vnode(directory, name, openMode, perms, kernel);
5648
5649	put_vnode(directory);
5650	return status;
5651}
5652
5653
5654static int
5655file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5656	int openMode, bool kernel)
5657{
5658	if (name == NULL || *name == '\0')
5659		return B_BAD_VALUE;
5660
5661	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5662		"openMode = %d)\n", mountID, directoryID, name, openMode));
5663
5664	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5665
5666	// get the vnode matching the entry_ref
5667	struct vnode* vnode;
5668	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5669		kernel, &vnode);
5670	if (status != B_OK)
5671		return status;
5672
5673	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5674		put_vnode(vnode);
5675		return B_LINK_LIMIT;
5676	}
5677
5678	int newFD = open_vnode(vnode, openMode, kernel);
5679	if (newFD >= 0) {
5680		// The vnode reference has been transferred to the FD
5681		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5682			directoryID, vnode->id, name);
5683	} else
5684		put_vnode(vnode);
5685
5686	return newFD;
5687}
5688
5689
5690static int
5691file_open(int fd, char* path, int openMode, bool kernel)
5692{
5693	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5694
5695	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5696		fd, path, openMode, kernel));
5697
5698	// get the vnode matching the vnode + path combination
5699	struct vnode* vnode;
5700	ino_t parentID;
5701	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5702		&parentID, kernel);
5703	if (status != B_OK)
5704		return status;
5705
5706	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5707		put_vnode(vnode);
5708		return B_LINK_LIMIT;
5709	}
5710
5711	// open the vnode
5712	int newFD = open_vnode(vnode, openMode, kernel);
5713	if (newFD >= 0) {
5714		// The vnode reference has been transferred to the FD
5715		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5716			vnode->device, parentID, vnode->id, NULL);
5717	} else
5718		put_vnode(vnode);
5719
5720	return newFD;
5721}
5722
5723
5724static status_t
5725file_close(struct file_descriptor* descriptor)
5726{
5727	struct vnode* vnode = descriptor->u.vnode;
5728	status_t status = B_OK;
5729
5730	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5731
5732	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5733		vnode->id);
5734	if (HAS_FS_CALL(vnode, close)) {
5735		status = FS_CALL(vnode, close, descriptor->cookie);
5736	}
5737
5738	if (status == B_OK) {
5739		// remove all outstanding locks for this team
5740		if (HAS_FS_CALL(vnode, release_lock))
5741			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5742		else
5743			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5744	}
5745	return status;
5746}
5747
5748
5749static void
5750file_free_fd(struct file_descriptor* descriptor)
5751{
5752	struct vnode* vnode = descriptor->u.vnode;
5753
5754	if (vnode != NULL) {
5755		FS_CALL(vnode, free_cookie, descriptor->cookie);
5756		put_vnode(vnode);
5757	}
5758}
5759
5760
5761static status_t
5762file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5763	size_t* length)
5764{
5765	struct vnode* vnode = descriptor->u.vnode;
5766	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5767		pos, length, *length));
5768
5769	if (S_ISDIR(vnode->Type()))
5770		return B_IS_A_DIRECTORY;
5771
5772	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5773}
5774
5775
5776static status_t
5777file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5778	size_t* length)
5779{
5780	struct vnode* vnode = descriptor->u.vnode;
5781	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5782		length));
5783
5784	if (S_ISDIR(vnode->Type()))
5785		return B_IS_A_DIRECTORY;
5786	if (!HAS_FS_CALL(vnode, write))
5787		return B_READ_ONLY_DEVICE;
5788
5789	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5790}
5791
5792
5793static off_t
5794file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5795{
5796	struct vnode* vnode = descriptor->u.vnode;
5797	off_t offset;
5798	bool isDevice = false;
5799
5800	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5801		seekType));
5802
5803	// some kinds of files are not seekable
5804	switch (vnode->Type() & S_IFMT) {
5805		case S_IFIFO:
5806		case S_IFSOCK:
5807			return ESPIPE;
5808
5809		// drivers publish block devices as chr, so pick both
5810		case S_IFBLK:
5811		case S_IFCHR:
5812			isDevice = true;
5813			break;
5814		// The Open Group Base Specs don't mention any file types besides pipes,
5815		// fifos, and sockets specially, so we allow seeking them.
5816		case S_IFREG:
5817		case S_IFDIR:
5818		case S_IFLNK:
5819			break;
5820	}
5821
5822	switch (seekType) {
5823		case SEEK_SET:
5824			offset = 0;
5825			break;
5826		case SEEK_CUR:
5827			offset = descriptor->pos;
5828			break;
5829		case SEEK_END:
5830		{
5831			// stat() the node
5832			if (!HAS_FS_CALL(vnode, read_stat))
5833				return B_UNSUPPORTED;
5834
5835			struct stat stat;
5836			status_t status = FS_CALL(vnode, read_stat, &stat);
5837			if (status != B_OK)
5838				return status;
5839
5840			offset = stat.st_size;
5841
5842			if (offset == 0 && isDevice) {
5843				// stat() on regular drivers doesn't report size
5844				device_geometry geometry;
5845
5846				if (HAS_FS_CALL(vnode, ioctl)) {
5847					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5848						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5849					if (status == B_OK)
5850						offset = (off_t)geometry.bytes_per_sector
5851							* geometry.sectors_per_track
5852							* geometry.cylinder_count
5853							* geometry.head_count;
5854				}
5855			}
5856
5857			break;
5858		}
5859		default:
5860			return B_BAD_VALUE;
5861	}
5862
5863	// assumes off_t is 64 bits wide
5864	if (offset > 0 && LONGLONG_MAX - offset < pos)
5865		return B_BUFFER_OVERFLOW;
5866
5867	pos += offset;
5868	if (pos < 0)
5869		return B_BAD_VALUE;
5870
5871	return descriptor->pos = pos;
5872}
5873
5874
5875static status_t
5876file_select(struct file_descriptor* descriptor, uint8 event,
5877	struct selectsync* sync)
5878{
5879	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5880
5881	struct vnode* vnode = descriptor->u.vnode;
5882
5883	// If the FS has no select() hook, notify select() now.
5884	if (!HAS_FS_CALL(vnode, select)) {
5885		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5886			return notify_select_event(sync, event);
5887		else
5888			return B_OK;
5889	}
5890
5891	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5892}
5893
5894
5895static status_t
5896file_deselect(struct file_descriptor* descriptor, uint8 event,
5897	struct selectsync* sync)
5898{
5899	struct vnode* vnode = descriptor->u.vnode;
5900
5901	if (!HAS_FS_CALL(vnode, deselect))
5902		return B_OK;
5903
5904	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5905}
5906
5907
5908static status_t
5909dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5910	bool kernel)
5911{
5912	struct vnode* vnode;
5913	status_t status;
5914
5915	if (name == NULL || *name == '\0')
5916		return B_BAD_VALUE;
5917
5918	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5919		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5920
5921	status = get_vnode(mountID, parentID, &vnode, true, false);
5922	if (status != B_OK)
5923		return status;
5924
5925	if (HAS_FS_CALL(vnode, create_dir))
5926		status = FS_CALL(vnode, create_dir, name, perms);
5927	else
5928		status = B_READ_ONLY_DEVICE;
5929
5930	put_vnode(vnode);
5931	return status;
5932}
5933
5934
5935static status_t
5936dir_create(int fd, char* path, int perms, bool kernel)
5937{
5938	char filename[B_FILE_NAME_LENGTH];
5939	struct vnode* vnode;
5940	status_t status;
5941
5942	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5943		kernel));
5944
5945	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5946	if (status < 0)
5947		return status;
5948
5949	if (HAS_FS_CALL(vnode, create_dir)) {
5950		status = FS_CALL(vnode, create_dir, filename, perms);
5951	} else
5952		status = B_READ_ONLY_DEVICE;
5953
5954	put_vnode(vnode);
5955	return status;
5956}
5957
5958
5959static int
5960dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5961{
5962	FUNCTION(("dir_open_entry_ref()\n"));
5963
5964	if (name && name[0] == '\0')
5965		return B_BAD_VALUE;
5966
5967	// get the vnode matching the entry_ref/node_ref
5968	struct vnode* vnode;
5969	status_t status;
5970	if (name) {
5971		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5972			&vnode);
5973	} else
5974		status = get_vnode(mountID, parentID, &vnode, true, false);
5975	if (status != B_OK)
5976		return status;
5977
5978	int newFD = open_dir_vnode(vnode, kernel);
5979	if (newFD >= 0) {
5980		// The vnode reference has been transferred to the FD
5981		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5982			vnode->id, name);
5983	} else
5984		put_vnode(vnode);
5985
5986	return newFD;
5987}
5988
5989
5990static int
5991dir_open(int fd, char* path, bool kernel)
5992{
5993	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5994		kernel));
5995
5996	// get the vnode matching the vnode + path combination
5997	struct vnode* vnode = NULL;
5998	ino_t parentID;
5999	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
6000		kernel);
6001	if (status != B_OK)
6002		return status;
6003
6004	// open the dir
6005	int newFD = open_dir_vnode(vnode, kernel);
6006	if (newFD >= 0) {
6007		// The vnode reference has been transferred to the FD
6008		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6009			parentID, vnode->id, NULL);
6010	} else
6011		put_vnode(vnode);
6012
6013	return newFD;
6014}
6015
6016
6017static status_t
6018dir_close(struct file_descriptor* descriptor)
6019{
6020	struct vnode* vnode = descriptor->u.vnode;
6021
6022	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6023
6024	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6025		vnode->id);
6026	if (HAS_FS_CALL(vnode, close_dir))
6027		return FS_CALL(vnode, close_dir, descriptor->cookie);
6028
6029	return B_OK;
6030}
6031
6032
6033static void
6034dir_free_fd(struct file_descriptor* descriptor)
6035{
6036	struct vnode* vnode = descriptor->u.vnode;
6037
6038	if (vnode != NULL) {
6039		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6040		put_vnode(vnode);
6041	}
6042}
6043
6044
6045static status_t
6046dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6047	struct dirent* buffer, size_t bufferSize, uint32* _count)
6048{
6049	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6050		bufferSize, _count);
6051}
6052
6053
6054static status_t
6055fix_dirent(struct vnode* parent, struct dirent* entry,
6056	struct io_context* ioContext)
6057{
6058	// set d_pdev and d_pino
6059	entry->d_pdev = parent->device;
6060	entry->d_pino = parent->id;
6061
6062	// If this is the ".." entry and the directory covering another vnode,
6063	// we need to replace d_dev and d_ino with the actual values.
6064	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6065		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6066			ioContext);
6067	}
6068
6069	// resolve covered vnodes
6070	ReadLocker _(&sVnodeLock);
6071
6072	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6073	if (vnode != NULL && vnode->covered_by != NULL) {
6074		do {
6075			vnode = vnode->covered_by;
6076		} while (vnode->covered_by != NULL);
6077
6078		entry->d_dev = vnode->device;
6079		entry->d_ino = vnode->id;
6080	}
6081
6082	return B_OK;
6083}
6084
6085
6086static status_t
6087dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6088	struct dirent* buffer, size_t bufferSize, uint32* _count)
6089{
6090	if (!HAS_FS_CALL(vnode, read_dir))
6091		return B_UNSUPPORTED;
6092
6093	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6094		_count);
6095	if (error != B_OK)
6096		return error;
6097
6098	// we need to adjust the read dirents
6099	uint32 count = *_count;
6100	for (uint32 i = 0; i < count; i++) {
6101		error = fix_dirent(vnode, buffer, ioContext);
6102		if (error != B_OK)
6103			return error;
6104
6105		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6106	}
6107
6108	return error;
6109}
6110
6111
6112static status_t
6113dir_rewind(struct file_descriptor* descriptor)
6114{
6115	struct vnode* vnode = descriptor->u.vnode;
6116
6117	if (HAS_FS_CALL(vnode, rewind_dir)) {
6118		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6119	}
6120
6121	return B_UNSUPPORTED;
6122}
6123
6124
6125static status_t
6126dir_remove(int fd, char* path, bool kernel)
6127{
6128	char name[B_FILE_NAME_LENGTH];
6129	struct vnode* directory;
6130	status_t status;
6131
6132	if (path != NULL) {
6133		// we need to make sure our path name doesn't stop with "/", ".",
6134		// or ".."
6135		char* lastSlash;
6136		while ((lastSlash = strrchr(path, '/')) != NULL) {
6137			char* leaf = lastSlash + 1;
6138			if (!strcmp(leaf, ".."))
6139				return B_NOT_ALLOWED;
6140
6141			// omit multiple slashes
6142			while (lastSlash > path && lastSlash[-1] == '/')
6143				lastSlash--;
6144
6145			if (leaf[0]
6146				&& strcmp(leaf, ".")) {
6147				break;
6148			}
6149			// "name/" -> "name", or "name/." -> "name"
6150			lastSlash[0] = '\0';
6151		}
6152
6153		if (!strcmp(path, ".") || !strcmp(path, ".."))
6154			return B_NOT_ALLOWED;
6155	}
6156
6157	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6158	if (status != B_OK)
6159		return status;
6160
6161	if (HAS_FS_CALL(directory, remove_dir))
6162		status = FS_CALL(directory, remove_dir, name);
6163	else
6164		status = B_READ_ONLY_DEVICE;
6165
6166	put_vnode(directory);
6167	return status;
6168}
6169
6170
6171static status_t
6172common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6173	size_t length)
6174{
6175	struct vnode* vnode = descriptor->u.vnode;
6176
6177	if (HAS_FS_CALL(vnode, ioctl))
6178		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6179
6180	return B_DEV_INVALID_IOCTL;
6181}
6182
6183
6184static status_t
6185common_fcntl(int fd, int op, size_t argument, bool kernel)
6186{
6187	struct flock flock;
6188
6189	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6190		fd, op, argument, kernel ? "kernel" : "user"));
6191
6192	struct io_context* context = get_current_io_context(kernel);
6193
6194	struct file_descriptor* descriptor = get_fd(context, fd);
6195	if (descriptor == NULL)
6196		return B_FILE_ERROR;
6197
6198	struct vnode* vnode = fd_vnode(descriptor);
6199
6200	status_t status = B_OK;
6201
6202	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6203		if (descriptor->type != FDTYPE_FILE)
6204			status = B_BAD_VALUE;
6205		else if (kernel)
6206			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6207		else if (user_memcpy(&flock, (struct flock*)argument,
6208				sizeof(struct flock)) != B_OK)
6209			status = B_BAD_ADDRESS;
6210		if (status != B_OK) {
6211			put_fd(descriptor);
6212			return status;
6213		}
6214	}
6215
6216	switch (op) {
6217		case F_SETFD:
6218		{
6219			// Set file descriptor flags
6220
6221			// O_CLOEXEC is the only flag available at this time
6222			mutex_lock(&context->io_mutex);
6223			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6224			mutex_unlock(&context->io_mutex);
6225
6226			status = B_OK;
6227			break;
6228		}
6229
6230		case F_GETFD:
6231		{
6232			// Get file descriptor flags
6233			mutex_lock(&context->io_mutex);
6234			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6235			mutex_unlock(&context->io_mutex);
6236			break;
6237		}
6238
6239		case F_SETFL:
6240			// Set file descriptor open mode
6241
6242			// we only accept changes to O_APPEND and O_NONBLOCK
6243			argument &= O_APPEND | O_NONBLOCK;
6244			if (descriptor->ops->fd_set_flags != NULL) {
6245				status = descriptor->ops->fd_set_flags(descriptor, argument);
6246			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6247				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6248					(int)argument);
6249			} else
6250				status = B_UNSUPPORTED;
6251
6252			if (status == B_OK) {
6253				// update this descriptor's open_mode field
6254				descriptor->open_mode = (descriptor->open_mode
6255					& ~(O_APPEND | O_NONBLOCK)) | argument;
6256			}
6257
6258			break;
6259
6260		case F_GETFL:
6261			// Get file descriptor open mode
6262			status = descriptor->open_mode;
6263			break;
6264
6265		case F_DUPFD:
6266		case F_DUPFD_CLOEXEC:
6267		{
6268			status = new_fd_etc(context, descriptor, (int)argument);
6269			if (status >= 0) {
6270				mutex_lock(&context->io_mutex);
6271				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6272				mutex_unlock(&context->io_mutex);
6273
6274				atomic_add(&descriptor->ref_count, 1);
6275			}
6276			break;
6277		}
6278
6279		case F_GETLK:
6280			if (vnode != NULL) {
6281				struct flock normalizedLock;
6282
6283				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6284				status = normalize_flock(descriptor, &normalizedLock);
6285				if (status != B_OK)
6286					break;
6287
6288				if (HAS_FS_CALL(vnode, test_lock)) {
6289					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6290						&normalizedLock);
6291				} else
6292					status = test_advisory_lock(vnode, &normalizedLock);
6293				if (status == B_OK) {
6294					if (normalizedLock.l_type == F_UNLCK) {
6295						// no conflicting lock found, copy back the same struct
6296						// we were given except change type to F_UNLCK
6297						flock.l_type = F_UNLCK;
6298						if (kernel) {
6299							memcpy((struct flock*)argument, &flock,
6300								sizeof(struct flock));
6301						} else {
6302							status = user_memcpy((struct flock*)argument,
6303								&flock, sizeof(struct flock));
6304						}
6305					} else {
6306						// a conflicting lock was found, copy back its range and
6307						// type
6308						if (normalizedLock.l_len == OFF_MAX)
6309							normalizedLock.l_len = 0;
6310
6311						if (kernel) {
6312							memcpy((struct flock*)argument,
6313								&normalizedLock, sizeof(struct flock));
6314						} else {
6315							status = user_memcpy((struct flock*)argument,
6316								&normalizedLock, sizeof(struct flock));
6317						}
6318					}
6319				}
6320			} else
6321				status = B_BAD_VALUE;
6322			break;
6323
6324		case F_SETLK:
6325		case F_SETLKW:
6326			status = normalize_flock(descriptor, &flock);
6327			if (status != B_OK)
6328				break;
6329
6330			if (vnode == NULL) {
6331				status = B_BAD_VALUE;
6332			} else if (flock.l_type == F_UNLCK) {
6333				if (HAS_FS_CALL(vnode, release_lock)) {
6334					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6335						&flock);
6336				} else {
6337					status = release_advisory_lock(vnode, context, NULL,
6338						&flock);
6339				}
6340			} else {
6341				// the open mode must match the lock type
6342				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6343						&& flock.l_type == F_WRLCK)
6344					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6345						&& flock.l_type == F_RDLCK))
6346					status = B_FILE_ERROR;
6347				else {
6348					if (HAS_FS_CALL(vnode, acquire_lock)) {
6349						status = FS_CALL(vnode, acquire_lock,
6350							descriptor->cookie, &flock, op == F_SETLKW);
6351					} else {
6352						status = acquire_advisory_lock(vnode, context, NULL,
6353							&flock, op == F_SETLKW);
6354					}
6355				}
6356			}
6357			break;
6358
6359		// ToDo: add support for more ops?
6360
6361		default:
6362			status = B_BAD_VALUE;
6363	}
6364
6365	put_fd(descriptor);
6366	return status;
6367}
6368
6369
6370static status_t
6371common_sync(int fd, bool kernel)
6372{
6373	struct file_descriptor* descriptor;
6374	struct vnode* vnode;
6375	status_t status;
6376
6377	FUNCTION(("common_fsync: entry. fd %d kernel %d\n", fd, kernel));
6378
6379	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6380	if (descriptor == NULL)
6381		return B_FILE_ERROR;
6382
6383	if (HAS_FS_CALL(vnode, fsync))
6384		status = FS_CALL_NO_PARAMS(vnode, fsync);
6385	else
6386		status = B_UNSUPPORTED;
6387
6388	put_fd(descriptor);
6389	return status;
6390}
6391
6392
6393static status_t
6394common_lock_node(int fd, bool kernel)
6395{
6396	struct file_descriptor* descriptor;
6397	struct vnode* vnode;
6398
6399	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6400	if (descriptor == NULL)
6401		return B_FILE_ERROR;
6402
6403	status_t status = B_OK;
6404
6405	// We need to set the locking atomically - someone
6406	// else might set one at the same time
6407	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6408			(file_descriptor*)NULL) != NULL)
6409		status = B_BUSY;
6410
6411	put_fd(descriptor);
6412	return status;
6413}
6414
6415
6416static status_t
6417common_unlock_node(int fd, bool kernel)
6418{
6419	struct file_descriptor* descriptor;
6420	struct vnode* vnode;
6421
6422	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6423	if (descriptor == NULL)
6424		return B_FILE_ERROR;
6425
6426	status_t status = B_OK;
6427
6428	// We need to set the locking atomically - someone
6429	// else might set one at the same time
6430	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6431			(file_descriptor*)NULL, descriptor) != descriptor)
6432		status = B_BAD_VALUE;
6433
6434	put_fd(descriptor);
6435	return status;
6436}
6437
6438
6439static status_t
6440common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6441	bool kernel)
6442{
6443	struct vnode* vnode;
6444	status_t status;
6445
6446	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6447	if (status != B_OK)
6448		return status;
6449
6450	if (HAS_FS_CALL(vnode, read_symlink)) {
6451		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6452	} else
6453		status = B_BAD_VALUE;
6454
6455	put_vnode(vnode);
6456	return status;
6457}
6458
6459
6460static status_t
6461common_create_symlink(int fd, char* path, const char* toPath, int mode,
6462	bool kernel)
6463{
6464	// path validity checks have to be in the calling function!
6465	char name[B_FILE_NAME_LENGTH];
6466	struct vnode* vnode;
6467	status_t status;
6468
6469	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6470		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6471
6472	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6473	if (status != B_OK)
6474		return status;
6475
6476	if (HAS_FS_CALL(vnode, create_symlink))
6477		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6478	else {
6479		status = HAS_FS_CALL(vnode, write)
6480			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6481	}
6482
6483	put_vnode(vnode);
6484
6485	return status;
6486}
6487
6488
6489static status_t
6490common_create_link(int pathFD, char* path, int toFD, char* toPath,
6491	bool traverseLeafLink, bool kernel)
6492{
6493	// path validity checks have to be in the calling function!
6494
6495	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6496		toPath, kernel));
6497
6498	char name[B_FILE_NAME_LENGTH];
6499	struct vnode* directory;
6500	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6501		kernel);
6502	if (status != B_OK)
6503		return status;
6504
6505	struct vnode* vnode;
6506	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6507		kernel);
6508	if (status != B_OK)
6509		goto err;
6510
6511	if (directory->mount != vnode->mount) {
6512		status = B_CROSS_DEVICE_LINK;
6513		goto err1;
6514	}
6515
6516	if (HAS_FS_CALL(directory, link))
6517		status = FS_CALL(directory, link, name, vnode);
6518	else
6519		status = B_READ_ONLY_DEVICE;
6520
6521err1:
6522	put_vnode(vnode);
6523err:
6524	put_vnode(directory);
6525
6526	return status;
6527}
6528
6529
6530static status_t
6531common_unlink(int fd, char* path, bool kernel)
6532{
6533	char filename[B_FILE_NAME_LENGTH];
6534	struct vnode* vnode;
6535	status_t status;
6536
6537	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6538		kernel));
6539
6540	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6541	if (status < 0)
6542		return status;
6543
6544	if (HAS_FS_CALL(vnode, unlink))
6545		status = FS_CALL(vnode, unli