/* * A Simple Filesystem for the Linux Kernel. * * Initial author: Sankar P * License: Creative Commons Zero License - http://creativecommons.org/publicdomain/zero/1.0/ */ #include #include #include #include #include #include #include "super.h" /* A super block lock that must be used for any critical section operation on the sb, * such as: updating the free_blocks, inodes_count etc. */ static DEFINE_MUTEX(simplefs_sb_lock); static DEFINE_MUTEX(simplefs_inodes_mgmt_lock); /* FIXME: This can be moved to an in-memory structure of the simplefs_inode. * Because of the global nature of this lock, we cannot create * new children (without locking) in two different dirs at a time. * They will get sequentially created. If we move the lock * to a directory-specific way (by moving it inside inode), the * insertion of two children in two different directories can be * done in parallel */ static DEFINE_MUTEX(simplefs_directory_children_update_lock); void simplefs_sb_sync(struct super_block *vsb) { struct buffer_head *bh; struct simplefs_super_block *sb = SIMPLEFS_SB(vsb); bh = (struct buffer_head *)sb_bread(vsb, SIMPLEFS_SUPERBLOCK_BLOCK_NUMBER); bh->b_data = (char *)sb; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } void simplefs_inode_add(struct super_block *vsb, struct simplefs_inode *inode) { struct simplefs_super_block *sb = SIMPLEFS_SB(vsb); struct buffer_head *bh; struct simplefs_inode *inode_iterator; if (mutex_lock_interruptible(&simplefs_inodes_mgmt_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return; } bh = (struct buffer_head *)sb_bread(vsb, SIMPLEFS_INODESTORE_BLOCK_NUMBER); inode_iterator = (struct simplefs_inode *)bh->b_data; if (mutex_lock_interruptible(&simplefs_sb_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return; } /* Append the new inode in the end in the inode store */ inode_iterator += sb->inodes_count; memcpy(inode_iterator, inode, sizeof(struct simplefs_inode)); sb->inodes_count++; mark_buffer_dirty(bh); simplefs_sb_sync(vsb); brelse(bh); mutex_unlock(&simplefs_sb_lock); mutex_unlock(&simplefs_inodes_mgmt_lock); } /* This function returns a blocknumber which is free. * The block will be removed from the freeblock list. * * In an ideal, production-ready filesystem, we will not be dealing with blocks, * and instead we will be using extents * * If for some reason, the file creation/deletion failed, the block number * will still be marked as non-free. You need fsck to fix this.*/ int simplefs_sb_get_a_freeblock(struct super_block *vsb, uint64_t * out) { struct simplefs_super_block *sb = SIMPLEFS_SB(vsb); int i; int ret = 0; if (mutex_lock_interruptible(&simplefs_sb_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); ret = -EINTR; goto end; } /* Loop until we find a free block. We start the loop from 3, * as all prior blocks will always be in use */ for (i = 3; i < SIMPLEFS_MAX_FILESYSTEM_OBJECTS_SUPPORTED; i++) { if (sb->free_blocks & (1 << i)) { break; } } if (unlikely(i == SIMPLEFS_MAX_FILESYSTEM_OBJECTS_SUPPORTED)) { printk(KERN_ERR "No more free blocks available"); ret = -ENOSPC; goto end; } *out = i; /* Remove the identified block from the free list */ sb->free_blocks &= ~(1 << i); simplefs_sb_sync(vsb); end: mutex_unlock(&simplefs_sb_lock); return ret; } static int simplefs_sb_get_objects_count(struct super_block *vsb, uint64_t * out) { struct simplefs_super_block *sb = SIMPLEFS_SB(vsb); if (mutex_lock_interruptible(&simplefs_inodes_mgmt_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return -EINTR; } *out = sb->inodes_count; mutex_unlock(&simplefs_inodes_mgmt_lock); return 0; } static int simplefs_readdir(struct file *filp, void *dirent, filldir_t filldir) { loff_t pos; struct inode *inode; struct super_block *sb; struct buffer_head *bh; struct simplefs_inode *sfs_inode; struct simplefs_dir_record *record; int i; pos = filp->f_pos; inode = filp->f_dentry->d_inode; sb = inode->i_sb; if (pos) { /* FIXME: We use a hack of reading pos to figure if we have filled in all data. * We should probably fix this to work in a cursor based model and * use the tokens correctly to not fill too many data in each cursor based call */ return 0; } sfs_inode = SIMPLEFS_INODE(inode); if (unlikely(!S_ISDIR(sfs_inode->mode))) { printk(KERN_ERR "inode [%llu][%lu] for fs object [%s] not a directory\n", sfs_inode->inode_no, inode->i_ino, filp->f_dentry->d_name.name); return -ENOTDIR; } bh = (struct buffer_head *)sb_bread(sb, sfs_inode->data_block_number); record = (struct simplefs_dir_record *)bh->b_data; for (i = 0; i < sfs_inode->dir_children_count; i++) { filldir(dirent, record->filename, SIMPLEFS_FILENAME_MAXLEN, pos, record->inode_no, DT_UNKNOWN); filp->f_pos += sizeof(struct simplefs_dir_record); pos += sizeof(struct simplefs_dir_record); record++; } brelse(bh); return 0; } /* This functions returns a simplefs_inode with the given inode_no * from the inode store, if it exists. */ struct simplefs_inode *simplefs_get_inode(struct super_block *sb, uint64_t inode_no) { struct simplefs_super_block *sfs_sb = SIMPLEFS_SB(sb); struct simplefs_inode *sfs_inode = NULL; int i; struct buffer_head *bh; /* The inode store can be read once and kept in memory permanently while mounting. * But such a model will not be scalable in a filesystem with * millions or billions of files (inodes) */ bh = (struct buffer_head *)sb_bread(sb, SIMPLEFS_INODESTORE_BLOCK_NUMBER); sfs_inode = (struct simplefs_inode *)bh->b_data; #if 0 if (mutex_lock_interruptible(&simplefs_inodes_mgmt_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return NULL; } #endif for (i = 0; i < sfs_sb->inodes_count; i++) { if (sfs_inode->inode_no == inode_no) { /* FIXME: bh->b_data is probably leaking */ return sfs_inode; } sfs_inode++; } // mutex_unlock(&simplefs_inodes_mgmt_lock); return NULL; } ssize_t simplefs_read(struct file * filp, char __user * buf, size_t len, loff_t * ppos) { /* Hack to make sure that we answer the read call only once and not loop infinitely. * We need to implement support for filesize in inode to remove this hack */ static int done = 0; /* After the commit dd37978c5 in the upstream linux kernel, * we can use just filp->f_inode instead of the * f->f_path.dentry->d_inode redirection */ struct simplefs_inode *inode = SIMPLEFS_INODE(filp->f_path.dentry->d_inode); struct buffer_head *bh; char *buffer; int nbytes; if (done) { done = 0; return 0; } if (*ppos >= inode->file_size) { /* Read request with offset beyond the filesize */ return 0; } bh = (struct buffer_head *)sb_bread(filp->f_path.dentry->d_inode->i_sb, inode->data_block_number); if (!bh) { printk(KERN_ERR "Reading the block number [%llu] failed.", inode->data_block_number); return 0; } buffer = (char *)bh->b_data; nbytes = min((size_t) inode->file_size, len); if (copy_to_user(buf, buffer, nbytes)) { brelse(bh); printk(KERN_ERR "Error copying file contents to the userspace buffer\n"); return -EFAULT; } brelse(bh); *ppos += nbytes; done = 1; return nbytes; } /* FIXME: The write support is rudimentary. I have not figured out a way to do writes * from particular offsets (even though I have written some untested code for this below) efficiently. */ ssize_t simplefs_write(struct file * filp, const char __user * buf, size_t len, loff_t * ppos) { /* After the commit dd37978c5 in the upstream linux kernel, * we can use just filp->f_inode instead of the * f->f_path.dentry->d_inode redirection */ struct inode *inode; struct simplefs_inode *sfs_inode; struct simplefs_inode *inode_iterator; struct buffer_head *bh; struct super_block *sb; char *buffer; int count; inode = filp->f_path.dentry->d_inode; sfs_inode = SIMPLEFS_INODE(inode); sb = inode->i_sb; if (*ppos + len >= SIMPLEFS_DEFAULT_BLOCK_SIZE) { printk(KERN_ERR "File size write will exceed a block"); return -ENOSPC; } bh = (struct buffer_head *)sb_bread(filp->f_path.dentry->d_inode->i_sb, sfs_inode->data_block_number); if (!bh) { printk(KERN_ERR "Reading the block number [%llu] failed.", sfs_inode->data_block_number); return 0; } buffer = (char *)bh->b_data; /* Move the pointer until the required byte offset */ buffer += *ppos; if (copy_from_user(buffer, buf, len)) { brelse(bh); printk(KERN_ERR "Error copying file contents from the userspace buffer to the kernel space\n"); return -EFAULT; } *ppos += len; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); /* Set new size * sfs_inode->file_size = max(sfs_inode->file_size, *ppos); * * FIXME: What to do if someone writes only some parts in between ? * The above code will also fail in case a file is overwritten with * a shorter buffer */ if (mutex_lock_interruptible(&simplefs_inodes_mgmt_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return -EINTR; } /* Save the modified inode */ bh = (struct buffer_head *)sb_bread(sb, SIMPLEFS_INODESTORE_BLOCK_NUMBER); sfs_inode->file_size = *ppos; inode_iterator = (struct simplefs_inode *)bh->b_data; if (mutex_lock_interruptible(&simplefs_sb_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return -EINTR; } count = 0; while (inode_iterator->inode_no != sfs_inode->inode_no && count < SIMPLEFS_SB(sb)->inodes_count) { count++; inode_iterator++; } if (likely(count < SIMPLEFS_SB(sb)->inodes_count)) { inode_iterator->file_size = sfs_inode->file_size; printk(KERN_INFO "The new filesize that is written is: [%llu] and len was: [%lu]\n", sfs_inode->file_size, len); mark_buffer_dirty(bh); sync_dirty_buffer(bh); } else { printk(KERN_ERR "The new filesize could not be stored to the inode."); len = -EIO; } brelse(bh); mutex_unlock(&simplefs_sb_lock); mutex_unlock(&simplefs_inodes_mgmt_lock); return len; } const struct file_operations simplefs_file_operations = { .read = simplefs_read, .write = simplefs_write, }; const struct file_operations simplefs_dir_operations = { .owner = THIS_MODULE, .readdir = simplefs_readdir, }; struct dentry *simplefs_lookup(struct inode *parent_inode, struct dentry *child_dentry, unsigned int flags); static int simplefs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); static int simplefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct inode_operations simplefs_inode_ops = { .create = simplefs_create, .lookup = simplefs_lookup, .mkdir = simplefs_mkdir, }; static int simplefs_create_fs_object(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct simplefs_inode *sfs_inode; struct simplefs_inode *inode_iterator; struct super_block *sb; struct simplefs_dir_record *record; struct simplefs_inode *parent_dir_inode; struct buffer_head *bh; struct simplefs_dir_record *dir_contents_datablock; uint64_t count; int ret; if (mutex_lock_interruptible(&simplefs_directory_children_update_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return -EINTR; } sb = dir->i_sb; ret = simplefs_sb_get_objects_count(sb, &count); if (ret < 0) { mutex_unlock(&simplefs_directory_children_update_lock); return ret; } if (unlikely(count >= SIMPLEFS_MAX_FILESYSTEM_OBJECTS_SUPPORTED)) { /* The above condition can be just == insted of the >= */ printk(KERN_ERR "Maximum number of objects supported by simplefs is already reached"); mutex_unlock(&simplefs_directory_children_update_lock); return -ENOSPC; } if (!S_ISDIR(mode) && !S_ISREG(mode)) { printk(KERN_ERR "Creation request but for neither a file nor a directory"); mutex_unlock(&simplefs_directory_children_update_lock); return -EINVAL; } inode = new_inode(sb); if (!inode) { mutex_unlock(&simplefs_directory_children_update_lock); return -ENOMEM; } inode->i_sb = sb; inode->i_op = &simplefs_inode_ops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = 10; /* Loop until we get an unique inode number */ while (simplefs_get_inode(sb, inode->i_ino)) { /* inode inode->i_ino already exists */ inode->i_ino++; } /* FIXME: This is leaking. We need to free all in-memory inodes sometime */ sfs_inode = kmalloc(sizeof(struct simplefs_inode), GFP_KERNEL); sfs_inode->inode_no = inode->i_ino; inode->i_private = sfs_inode; sfs_inode->mode = mode; if (S_ISDIR(mode)) { printk(KERN_INFO "New directory creation request\n"); sfs_inode->dir_children_count = 0; inode->i_fop = &simplefs_dir_operations; } else if (S_ISREG(mode)) { printk(KERN_INFO "New file creation request\n"); sfs_inode->file_size = 0; inode->i_fop = &simplefs_file_operations; } /* First get a free block and update the free map, * Then add inode to the inode store and update the sb inodes_count, * Then update the parent directory's inode with the new child. * * The above ordering helps us to maintain fs consistency * even in most crashes */ ret = simplefs_sb_get_a_freeblock(sb, &sfs_inode->data_block_number); if (ret < 0) { printk(KERN_ERR "simplefs could not get a freeblock"); mutex_unlock(&simplefs_directory_children_update_lock); return ret; } simplefs_inode_add(sb, sfs_inode); record = kmalloc(sizeof(struct simplefs_dir_record), GFP_KERNEL); record->inode_no = sfs_inode->inode_no; strcpy(record->filename, dentry->d_name.name); parent_dir_inode = SIMPLEFS_INODE(dir); bh = sb_bread(sb, parent_dir_inode->data_block_number); dir_contents_datablock = (struct simplefs_dir_record *)bh->b_data; /* Navigate to the last record in the directory contents */ dir_contents_datablock += parent_dir_inode->dir_children_count; memcpy(dir_contents_datablock, record, sizeof(struct simplefs_dir_record)); mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); if (mutex_lock_interruptible(&simplefs_inodes_mgmt_lock)) { mutex_unlock(&simplefs_directory_children_update_lock); printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return -EINTR; } bh = (struct buffer_head *)sb_bread(sb, SIMPLEFS_INODESTORE_BLOCK_NUMBER); inode_iterator = (struct simplefs_inode *)bh->b_data; if (mutex_lock_interruptible(&simplefs_sb_lock)) { printk(KERN_ERR "Failed to acquire mutex lock %s +%d\n", __FILE__, __LINE__); return -EINTR; } count = 0; while (inode_iterator->inode_no != parent_dir_inode->inode_no && count < SIMPLEFS_SB(sb)->inodes_count) { count++; inode_iterator++; } if (likely(inode_iterator->inode_no == parent_dir_inode->inode_no)) { parent_dir_inode->dir_children_count++; inode_iterator->dir_children_count = parent_dir_inode->dir_children_count; /* Updated the parent inode's dir count to reflect the new child too */ mark_buffer_dirty(bh); sync_dirty_buffer(bh); } else { printk(KERN_ERR "The updated childcount could not be stored to the dir inode."); /* TODO: Remove the newly created inode from the disk and in-memory inode store * and also update the superblock, freemaps etc. to reflect the same. * Basically, Undo all actions done during this create call */ } brelse(bh); mutex_unlock(&simplefs_sb_lock); mutex_unlock(&simplefs_inodes_mgmt_lock); mutex_unlock(&simplefs_directory_children_update_lock); inode_init_owner(inode, dir, mode); d_add(dentry, inode); return 0; } static int simplefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { /* I believe this is a bug in the kernel, for some reason, the mkdir callback * does not get the S_IFDIR flag set. Even ext2 sets is explicitly */ return simplefs_create_fs_object(dir, dentry, S_IFDIR | mode); } static int simplefs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return simplefs_create_fs_object(dir, dentry, mode); } struct dentry *simplefs_lookup(struct inode *parent_inode, struct dentry *child_dentry, unsigned int flags) { struct simplefs_inode *parent = SIMPLEFS_INODE(parent_inode); struct super_block *sb = parent_inode->i_sb; struct buffer_head *bh; struct simplefs_dir_record *record; int i; bh = (struct buffer_head *)sb_bread(sb, parent->data_block_number); record = (struct simplefs_dir_record *)bh->b_data; for (i = 0; i < parent->dir_children_count; i++) { if (!strcmp(record->filename, child_dentry->d_name.name)) { /* FIXME: There is a corner case where if an allocated inode, * is not written to the inode store, but the inodes_count is * incremented. Then if the random string on the disk matches * with the filename that we are comparing above, then we * will use an invalid unintialized inode */ struct inode *inode; struct simplefs_inode *sfs_inode; /* FIXME: This simplefs_inode is leaking */ sfs_inode = simplefs_get_inode(sb, record->inode_no); /* FIXME: This inode is leaking */ inode = new_inode(sb); inode->i_ino = record->inode_no; inode_init_owner(inode, parent_inode, sfs_inode->mode); inode->i_sb = sb; inode->i_op = &simplefs_inode_ops; if (S_ISDIR(inode->i_mode)) inode->i_fop = &simplefs_dir_operations; else if (S_ISREG(inode->i_mode)) inode->i_fop = &simplefs_file_operations; else printk(KERN_ERR "Unknown inode type. Neither a directory nor a file"); /* FIXME: We should store these times to disk and retrieve them */ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_private = sfs_inode; d_add(child_dentry, inode); return NULL; } record++; } printk(KERN_ERR "No inode found for the filename [%s]\n", child_dentry->d_name.name); return NULL; } /* This function, as the name implies, Makes the super_block valid and * fills filesystem specific information in the super block */ int simplefs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *root_inode; struct buffer_head *bh; struct simplefs_super_block *sb_disk; bh = (struct buffer_head *)sb_bread(sb, SIMPLEFS_SUPERBLOCK_BLOCK_NUMBER); sb_disk = (struct simplefs_super_block *)bh->b_data; /* FIXME: bh->b_data is probably leaking */ printk(KERN_INFO "The magic number obtained in disk is: [%llu]\n", sb_disk->magic); if (unlikely(sb_disk->magic != SIMPLEFS_MAGIC)) { printk(KERN_ERR "The filesystem that you try to mount is not of type simplefs. Magicnumber mismatch."); return -EPERM; } if (unlikely(sb_disk->block_size != SIMPLEFS_DEFAULT_BLOCK_SIZE)) { printk(KERN_ERR "simplefs seem to be formatted using a non-standard block size."); return -EPERM; } printk(KERN_INFO "simplefs filesystem of version [%llu] formatted with a block size of [%llu] detected in the device.\n", sb_disk->version, sb_disk->block_size); /* A magic number that uniquely identifies our filesystem type */ sb->s_magic = SIMPLEFS_MAGIC; /* For all practical purposes, we will be using this s_fs_info as the super block */ sb->s_fs_info = sb_disk; root_inode = new_inode(sb); root_inode->i_ino = SIMPLEFS_ROOTDIR_INODE_NUMBER; inode_init_owner(root_inode, NULL, S_IFDIR); root_inode->i_sb = sb; root_inode->i_op = &simplefs_inode_ops; root_inode->i_fop = &simplefs_dir_operations; root_inode->i_atime = root_inode->i_mtime = root_inode->i_ctime = CURRENT_TIME; root_inode->i_private = simplefs_get_inode(sb, SIMPLEFS_ROOTDIR_INODE_NUMBER); sb->s_root = d_make_root(root_inode); if (!sb->s_root) return -ENOMEM; return 0; } static struct dentry *simplefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct dentry *ret; ret = mount_bdev(fs_type, flags, dev_name, data, simplefs_fill_super); if (unlikely(IS_ERR(ret))) printk(KERN_ERR "Error mounting simplefs"); else printk(KERN_INFO "simplefs is succesfully mounted on [%s]\n", dev_name); return ret; } static void simplefs_kill_superblock(struct super_block *s) { printk(KERN_INFO "simplefs superblock is destroyed. Unmount succesful.\n"); /* This is just a dummy function as of now. As our filesystem gets matured, * we will do more meaningful operations here */ return; } struct file_system_type simplefs_fs_type = { .owner = THIS_MODULE, .name = "simplefs", .mount = simplefs_mount, .kill_sb = simplefs_kill_superblock, }; static int simplefs_init(void) { int ret; ret = register_filesystem(&simplefs_fs_type); if (likely(ret == 0)) printk(KERN_INFO "Sucessfully registered simplefs\n"); else printk(KERN_ERR "Failed to register simplefs. Error:[%d]", ret); return ret; } static void simplefs_exit(void) { int ret; ret = unregister_filesystem(&simplefs_fs_type); if (likely(ret == 0)) printk(KERN_INFO "Sucessfully unregistered simplefs\n"); else printk(KERN_ERR "Failed to unregister simplefs. Error:[%d]", ret); } module_init(simplefs_init); module_exit(simplefs_exit); MODULE_LICENSE("CC0"); MODULE_AUTHOR("Sankar P");