mmap实现分析
mmap实现分析
本文不是介绍mmap函数的使用方法,而是分析其内核实现,相关使用方法网上已经有很多资料。mmap的本质其实就是:为当前进程分配(或找到)一个合适的vma,然后为该vma设置对应的缺页处理函数。
我们知道mmap按照flag可以分为匿名映射和非匿名映射,又可分为shared映射和private映射。这样从两个维度,我们就得到了四种映射。
(1)匿名shared映射:fd为-1,可用于父子进程通信。
(2)匿名private映射:例如malloc大块的内存(大于128k)。
(3)非匿名shared映射:常见的用于进程通信方式。
(4)非匿名private映射:例如程序在启动时加载so时,就是用的这种方式,相当于“写时拷贝”。
下面我们就看下内核中几种方式的区别。
内核中mmap主要由sys_mmap_pgoff函数负责实现,该函数定义在mm/mmap.c中。
- SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
- unsigned long, prot, unsigned long, flags,
- unsigned long, fd, unsigned long, pgoff)
- { /* mmap syscall entry: resolves fd to a struct file unless MAP_ANONYMOUS is set, then delegates to vm_mmap_pgoff() */
- struct file *file = NULL;
- unsigned long retval = -EBADF; /* value returned if fget() below finds no file */
- if (!(flags & MAP_ANONYMOUS)) { /* file-backed (non-anonymous) mapping */
- audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB)) /* MAP_HUGETLB is rejected when MAP_ANONYMOUS is not set */
- return -EINVAL;
- file = fget(fd); /* look up the struct file that belongs to fd */
- if (!file)
- goto out;
- if (is_file_hugepages(file))
- len = ALIGN(len, huge_page_size(hstate_file(file))); /* align len to the file's huge page size */
- } else if (flags & MAP_HUGETLB) {
- /*......*/
- }
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); /* these two flags are cleared before the mapping call */
- retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- if (file)
- fput(file); /* pairs with the fget() above */
- out:
- return retval;
- }
该函数主要功能由vm_mmap_pgoff来实现,而vm_mmap_pgoff主要逻辑就是调用了do_mmap_pgoff。下面我们看do_mmap_pgoff的实现。
● do_mmap_pgoff
- unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff,
- unsigned long *populate)
- { /* Excerpt: picks the address via get_unmapped_area(), then passes the range to mmap_region() */
- struct mm_struct * mm = current->mm;
- struct inode *inode;
- /*......*/
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- */
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK) /* sub-page bits set: addr is returned unchanged; presumably an error code from get_unmapped_area() -- confirm */
- return addr;
- /*......*/
- addr = mmap_region(file, addr, len, vm_flags, pgoff); /* vm_flags is not declared in this excerpt (elided above) */
- /*......*/
- return addr;
- }
这个函数首先通过get_unmapped_area找到(或校验)一段合适的、尚未映射的虚拟地址区间,然后调用mmap_region为该区间建立vma并进行设置。我们具体看下mmap_region的实现。
● mmap_region
- unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
- { /* Excerpt: chooses how the vma is backed -- file mmap handler, shmem_zero_setup(), or nothing extra */
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- int correct_wcount = 0;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
- struct inode *inode = file ? file_inode(file) : NULL;
- /*......*/
- if (file) { /* not an anonymous mapping (file-backed) */
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
- if (vm_flags & VM_DENYWRITE) {
- error = deny_write_access(file);
- if (error)
- goto free_vma;
- correct_wcount = 1; /* records that deny_write_access() succeeded */
- }
- vma->vm_file = get_file(file);
- error = file->f_op->mmap(file, vma); /* call the mmap handler from the file's f_op table */
- if (error)
- goto unmap_and_free_vma;
- addr = vma->vm_start; /* addr, pgoff and vm_flags are re-read from the vma after the handler ran */
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) { /* shared anonymous mapping */
- if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
- goto free_vma;
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } /* private anonymous mapping: neither branch above runs */
- file = vma->vm_file;
- /*......*/
- }
如果传入了fd,则调用对应文件系统的mmap函数。以ext4文件系统为例。其mmap函数为 ext4_file_mmap。
● ext4_file_mmap
- static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
- { /* Installs ext4_file_vm_ops on the vma; returns 0, or -ENOEXEC if the mapping has no readpage operation */
- struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage) /* no readpage operation on this mapping */
- return -ENOEXEC;
- file_accessed(file);
- vma->vm_ops = &ext4_file_vm_ops; /* the only change made to the vma here */
- return 0;
- }
可以看到这个函数只是设置vma->vm_ops为当前文件系统的处理函数。
- static const struct vm_operations_struct ext4_file_vm_ops = { /* assigned to vma->vm_ops by ext4_file_mmap() */
- .fault = filemap_fault,
- .page_mkwrite = ext4_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
- };
如果是匿名映射(不传入fd),且传入了shared flag。则调用shmem_zero_setup。
● shmem_zero_setup
- int shmem_zero_setup(struct vm_area_struct *vma)
- { /* Backs the vma with a new shmem file and installs shmem_vm_ops; returns 0 or the error from shmem_file_setup() */
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start; /* length of the vma's address range */
- file = shmem_file_setup("dev/zero", size, vma->vm_flags);
- if (IS_ERR(file))
- return PTR_ERR(file); /* propagate the setup error to the caller */
- if (vma->vm_file)
- fput(vma->vm_file); /* release the file previously attached to the vma, if any */
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
- return 0;
- }
可以看到这里将vma->vm_ops设置为tmpfs文件系统的shmem_vm_ops。
- static const struct vm_operations_struct shmem_vm_ops = { /* assigned to vma->vm_ops by shmem_zero_setup() */
- .fault = shmem_fault,
- #ifdef CONFIG_NUMA
- .set_policy = shmem_set_policy, /* policy hooks are present only when CONFIG_NUMA is defined */
- .get_policy = shmem_get_policy,
- #endif
- .remap_pages = generic_file_remap_pages,
- };
整个mmap函数的处理过程如下:
我们知道mmap函数只是为进程分配了虚拟内存空间,并没有真的建立虚拟内存和物理内存的映射。这个建立映射的过程是到缺页中断的函数中进行的。
缺页中断的处理过程大体如下: