Today we look at how the VFS write() path is implemented. The syscall entry point for write() is defined in fs/read_write.c:
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);	/* resolve the fd to its struct file (struct fd wraps the file) */
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);	/* write the file */
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}
	return ret;
}
Next, let's focus on vfs_write():
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))	/* check the file mode */
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);	/* validate the target range (lock conflict and deadlock checks) */
	if (!ret) {
		if (count > MAX_RW_COUNT)
			count = MAX_RW_COUNT;
		file_start_write(file);	/* check the freeze level (unfrozen, frozen-write, frozen-pagefault, frozen-fs, complete) and bump the writer count */
		ret = __vfs_write(file, buf, count, pos);	/* hand off to the filesystem's write implementation */
		if (ret > 0) {
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		inc_syscw(current);	/* bump the current task's write-syscall counter */
		file_end_write(file);
	}
	return ret;
}
rw_verify_area() validates the range to be written, including mandatory-lock and deadlock checks; __vfs_write() invokes the file's write operation, a callback installed when the filesystem was initialized. Each filesystem registers its own set of callbacks; for xfs, they live in fs/xfs/xfs_file.c:
const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
};
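To make the callback wiring concrete, here is a minimal, hypothetical file_operations table for an imaginary filesystem "myfs" built entirely from stock VFS helpers ("myfs" is an assumption for illustration; only the generic_* helpers are real kernel symbols):

#include <linux/fs.h>

static const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,	/* __vfs_write() dispatches here via new_sync_write() */
	.mmap		= generic_file_mmap,
	.open		= generic_file_open,
	.fsync		= generic_file_fsync,
};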
In my earlier article on the VFS read path I used xfs's read implementation as the example; this write walkthrough uses xfs as well.
Let's start with a detailed look at rw_verify_area():
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode;
	int retval = -EINVAL;

	inode = file_inode(file);	/* get the struct inode behind this struct file */
	if (unlikely((ssize_t) count < 0))
		return retval;

	/*
	 * ranged mandatory locking does not apply to streams - it makes sense
	 * only for files where position has a meaning.
	 */
	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			if (!unsigned_offsets(file))
				return retval;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return retval;
		}

		if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
			retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
					read_write == READ ? F_RDLCK : F_WRLCK);	/* lock the target range [pos, pos+count-1] */
			if (retval < 0)
				return retval;
		}
	}
	return security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
}
The key question is how locks_mandatory_area() locks the target range.
int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
			 loff_t end, unsigned char type)
{
	struct file_lock fl;
	int error;
	bool sleep = false;

	locks_init_lock(&fl);		/* initialize the file_lock */
	fl.fl_pid = current->tgid;	/* current thread-group id */
	fl.fl_file = filp;
	fl.fl_flags = FL_POSIX | FL_ACCESS;
	if (filp && !(filp->f_flags & O_NONBLOCK))	/* not opened non-blocking? */
		sleep = true;				/* blocking mode */
	fl.fl_type = type;	/* type = F_WRLCK for a write */
	fl.fl_start = start;	/* start of the locked range */
	fl.fl_end = end;	/* end of the locked range */

	for (;;) {
		if (filp) {
			fl.fl_owner = filp;
			fl.fl_flags &= ~FL_SLEEP;
			error = posix_lock_inode(inode, &fl, NULL);
			if (!error)
				break;
		}

		if (sleep)
			fl.fl_flags |= FL_SLEEP;	/* blocking mode: add the FL_SLEEP flag */
		fl.fl_owner = current->files;		/* the files_struct that owns the lock */
		error = posix_lock_inode(inode, &fl, NULL);
		if (error != FILE_LOCK_DEFERRED)
			break;
		error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker);
		if (!error) {
			/*
			 * If we've been sleeping someone might have
			 * changed the permissions behind our back.
			 */
			if (__mandatory_lock(inode))
				continue;
		}
		break;
	}
	locks_delete_block(&fl);

	return error;
}
static int posix_lock_inode(struct inode *inode, struct file_lock *request,
			    struct file_lock *conflock)
{
	struct file_lock *fl, *tmp;
	struct file_lock *new_fl = NULL;
	struct file_lock *new_fl2 = NULL;
	struct file_lock *left = NULL;
	struct file_lock *right = NULL;
	struct file_lock_context *ctx;
	int error;
	bool added = false;
	LIST_HEAD(dispose);

	/* look up (or allocate) the file_lock_context */
	ctx = locks_get_lock_context(inode, request->fl_type);	/* request->fl_type = F_WRLCK here */
	if (!ctx)
		return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;

	/*
	 * We may need two file_lock structures for this operation,
	 * so we get them in advance to avoid races.
	 *
	 * In some cases we can be sure, that no new locks will be needed
	 */
	if (!(request->fl_flags & FL_ACCESS) &&
	    (request->fl_type != F_UNLCK ||
	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
		new_fl = locks_alloc_lock();
		new_fl2 = locks_alloc_lock();
	}

	percpu_down_read(&file_rwsem);
	spin_lock(&ctx->flc_lock);
	/*
	 * New lock request. Walk all POSIX locks and look for conflicts. If
	 * there are any, either return error or put the request on the
	 * blocker's list of waiters and the global blocked_hash.
	 */
	if (request->fl_type != F_UNLCK) {
		list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
			if (!posix_locks_conflict(request, fl))	/* does the request conflict with this existing lock? */
				continue;			/* no conflict: keep walking */
			if (conflock)
				locks_copy_conflock(conflock, fl);
			error = -EAGAIN;
			if (!(request->fl_flags & FL_SLEEP))
				goto out;
			/*
			 * Deadlock detection and insertion into the blocked
			 * locks list must be done while holding the same lock!
			 */
			error = -EDEADLK;
			spin_lock(&blocked_lock_lock);
			/*
			 * Ensure that we don't find any locks blocked on this
			 * request during deadlock detection.
			 */
			__locks_wake_up_blocks(request);
			if (likely(!posix_locks_deadlock(request, fl))) {
				error = FILE_LOCK_DEFERRED;
				__locks_insert_block(fl, request,
						     posix_locks_conflict);
			}
			spin_unlock(&blocked_lock_lock);
			goto out;
		}
	}
	/* If we're just looking for a conflict, we're done. */
	error = 0;
	if (request->fl_flags & FL_ACCESS)	/* FL_ACCESS means "not trying to lock, just looking", so bail out */
		goto out;

	/* Find the first old lock with the same owner as the new lock */
	list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
		if (posix_same_owner(request, fl))	/* do the two locks share an owner? */
			break;
	}

	/* Process locks with this owner. */
	list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
		if (!posix_same_owner(request, fl))	/* different owner: */
			break;				/* stop here */

		/* Detect adjacent or overlapping regions (if same lock type) */
		if (request->fl_type == fl->fl_type) {	/* same owner and same lock type */
			/* In all comparisons of start vs end, use
			 * "start - 1" rather than "end + 1". If end
			 * is OFFSET_MAX, end + 1 will become negative.
			 */
			if (fl->fl_end < request->fl_start - 1)
				continue;
			/* If the next lock in the list has entirely bigger
			 * addresses than the new one, insert the lock here.
			 */
			if (fl->fl_start - 1 > request->fl_end)
				break;

			/* If we come here, the new and old lock are of the
			 * same type and adjacent or overlapping. Make one
			 * lock yielding from the lower start address of both
			 * locks to the higher end address.
			 */
			/* same owner, same type, overlapping or adjacent ranges:
			 * merge into one lock from the smaller start to the larger end */
			if (fl->fl_start > request->fl_start)
				fl->fl_start = request->fl_start;
			else
				request->fl_start = fl->fl_start;
			if (fl->fl_end < request->fl_end)
				fl->fl_end = request->fl_end;
			else
				request->fl_end = fl->fl_end;
			if (added) {
				locks_delete_lock_ctx(fl, &dispose);
				continue;
			}
			request = fl;
			added = true;
		} else {	/* the two locks have different types */
			/* Processing for different lock types is a bit
			 * more complex.
			 */
			if (fl->fl_end < request->fl_start)	/* disjoint: the request range lies entirely after fl */
				continue;
			if (fl->fl_start > request->fl_end)	/* disjoint: the request range lies entirely before fl */
				break;
			if (request->fl_type == F_UNLCK)
				added = true;
			if (fl->fl_start < request->fl_start)	/* overlap: the request starts inside fl's range */
				left = fl;
			/* If the next lock in the list has a higher end
			 * address than the new one, insert the new one here.
			 */
			if (fl->fl_end > request->fl_end) {	/* overlap: the request ends inside fl's range */
				right = fl;
				break;
			}
			if (fl->fl_start >= request->fl_start) {
				/* The new lock completely replaces an old
				 * one (This may happen several times).
				 */
				if (added) {
					locks_delete_lock_ctx(fl, &dispose);
					continue;
				}
				/*
				 * Replace the old lock with new_fl, and
				 * remove the old one. It's safe to do the
				 * insert here since we know that we won't be
				 * using new_fl later, and that the lock is
				 * just replacing an existing lock.
				 */
				error = -ENOLCK;
				if (!new_fl)
					goto out;
				locks_copy_lock(new_fl, request);
				request = new_fl;
				new_fl = NULL;
				locks_insert_lock_ctx(request, &fl->fl_list);
				locks_delete_lock_ctx(fl, &dispose);
				added = true;
			}
		}
	}
	/*
	 * The above code only modifies existing locks in case of merging or
	 * replacing. If new lock(s) need to be inserted all modifications are
	 * done below this, so it's safe yet to bail out.
	 */
	error = -ENOLCK; /* "no luck" */
	if (right && left == right && !new_fl2)
		goto out;

	error = 0;
	if (!added) {
		if (request->fl_type == F_UNLCK) {
			if (request->fl_flags & FL_EXISTS)
				error = -ENOENT;
			goto out;
		}

		if (!new_fl) {
			error = -ENOLCK;
			goto out;
		}
		locks_copy_lock(new_fl, request);
		locks_move_blocks(new_fl, request);
		locks_insert_lock_ctx(new_fl, &fl->fl_list);
		fl = new_fl;
		new_fl = NULL;
	}
	if (right) {
		if (left == right) {
			/* The new lock breaks the old one in two pieces,
			 * so we have to use the second new lock.
			 */
			left = new_fl2;
			new_fl2 = NULL;
			locks_copy_lock(left, right);
			locks_insert_lock_ctx(left, &fl->fl_list);
		}
		right->fl_start = request->fl_end + 1;
		locks_wake_up_blocks(right);
	}
	if (left) {
		left->fl_end = request->fl_start - 1;
		locks_wake_up_blocks(left);
	}
 out:
	spin_unlock(&ctx->flc_lock);
	percpu_up_read(&file_rwsem);
	/*
	 * Free any unused locks.
	 */
	if (new_fl)
		locks_free_lock(new_fl);
	if (new_fl2)
		locks_free_lock(new_fl2);
	locks_dispose_list(&dispose);
	trace_posix_lock_inode(inode, request, error);
	return error;
}
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
	struct file_lock_context *ctx;

	/* paired with cmpxchg() below */
	ctx = smp_load_acquire(&inode->i_flctx);
	if (likely(ctx) || type == F_UNLCK)
		goto out;

	/* i_flctx is NULL: allocate a new context */
	ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
	if (!ctx)
		goto out;

	spin_lock_init(&ctx->flc_lock);
	INIT_LIST_HEAD(&ctx->flc_flock);
	INIT_LIST_HEAD(&ctx->flc_posix);
	INIT_LIST_HEAD(&ctx->flc_lease);

	/*
	 * Assign the pointer if it's not already assigned. If it is, then
	 * free the context we just allocated.
	 */
	if (cmpxchg(&inode->i_flctx, NULL, ctx)) {	/* lost the race: another task installed a context first, so free ours and reload theirs */
		kmem_cache_free(flctx_cache, ctx);
		ctx = smp_load_acquire(&inode->i_flctx);
	}
out:
	trace_locks_get_lock_context(inode, type, ctx);
	return ctx;
}
static bool posix_locks_conflict(struct file_lock *caller_fl,
				 struct file_lock *sys_fl)
{
	/* POSIX locks owned by the same process do not conflict with
	 * each other.
	 */
	if (posix_same_owner(caller_fl, sys_fl))	/* same owner (same files_struct)? */
		return false;				/* same owner: never a conflict */

	/* Check whether they overlap */
	if (!locks_overlap(caller_fl, sys_fl))		/* do the two locked ranges overlap? */
		return false;				/* no overlap: no conflict */

	/* different owners and overlapping ranges: conflict depends on the lock types */
	return locks_conflict(caller_fl, sys_fl);	/* F_WRLCK on either side means a conflict */
}

static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
{
	return ((fl1->fl_end >= fl2->fl_start) &&
		(fl2->fl_end >= fl1->fl_start));
}

/*
 * Check whether two locks have the same owner.
 */
static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
{
	return fl1->fl_owner == fl2->fl_owner;
}

static bool locks_conflict(struct file_lock *caller_fl,
			   struct file_lock *sys_fl)
{
	if (sys_fl->fl_type == F_WRLCK)
		return true;
	if (caller_fl->fl_type == F_WRLCK)
		return true;
	return false;
}
__vfs_write() dispatches to the filesystem's file_operations; again, this article uses xfs as the example.
static ssize_t __vfs_write(struct file *file, const char __user *p,
			   size_t count, loff_t *pos)
{
	if (file->f_op->write)		/* the file has a plain write callback */
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter)
		return new_sync_write(file, p, count, pos);
	else
		return -EINVAL;
}
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_init(&iter, WRITE, &iov, 1, len);

	ret = call_write_iter(filp, &kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ret > 0 && ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
				      struct iov_iter *iter)
{
	return file->f_op->write_iter(kio, iter);
}
Next we analyze xfs's write_iter, whose callback is xfs_file_write_iter().
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))		/* DAX write (persistent memory, no page cache) */
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW. In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_aio_write(iocb, from);
		if (ret != -EREMCHG)
			return ret;
	}

	return xfs_file_buffered_aio_write(iocb, from);	/* buffered write */
}
Let's focus on xfs_file_buffered_aio_write(), which writes through the page cache rather than doing direct I/O (which bypasses the buffer/cache layer):
STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
	int			iolock;

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

write_retry:
	iolock = XFS_IOLOCK_EXCL;	/* exclusive I/O lock */
	xfs_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops);	/* perform the write */
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/* ... the ENOSPC retry and unlock path of this function is elided here ... */
}
xfs_buffered_write_iomap_ops is a struct of callbacks:
const struct iomap_ops xfs_buffered_write_iomap_ops = {
	.iomap_begin		= xfs_buffered_write_iomap_begin,
	.iomap_end		= xfs_buffered_write_iomap_end,
};
xfs_file_aio_write_checks() validates the target file and the data to be written; it in turn calls generic_write_checks():
inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	loff_t count;
	int ret;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	/* FIXME: this is for backwards compatibility with 2.4 */
	if (iocb->ki_flags & IOCB_APPEND)		/* append mode: */
		iocb->ki_pos = i_size_read(inode);	/* start at the current file size, i.e. inode->i_size */

	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	count = iov_iter_count(from);
	ret = generic_write_check_limits(file, iocb->ki_pos, &count);	/* cap the write against the file-size limits */
	if (ret)
		return ret;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
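The IOCB_APPEND branch is what makes O_APPEND writes land at the end of the file: the position is recomputed from i_size here, under the filesystem's locks, so concurrent appenders do not overwrite each other. A short sketch (the path /tmp/append.log is an assumption):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* every write() ignores f_pos and starts at the current i_size */
	int fd = open("/tmp/append.log", O_WRONLY | O_CREAT | O_APPEND, 0644);
	if (fd < 0)
		return 1;
	const char *line = "appended atomically\n";
	write(fd, line, strlen(line));	/* iocb->ki_pos = i_size_read(inode) */
	close(fd);
	return 0;
}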
struct iov_iter {
	/*
	 * Bit 0 is the read/write bit, set if we're writing.
	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
	 * the caller isn't expecting to drop a page reference when done.
	 */
	/*
	 * The direction refers to the data, not the iterator itself:
	 * an iov_iter created for a read is written into, and one
	 * created for a write is read from.
	 */
	unsigned int type;
	size_t iov_offset;	/* byte offset into the data of the first iovec pointed to by iov */
	size_t count;		/* total number of data bytes described by the iovec array */
	union {
		const struct iovec *iov;
		const struct kvec *kvec;
		const struct bio_vec *bvec;
		struct pipe_inode_info *pipe;
	};
	union {
		unsigned long nr_segs;	/* number of iovec structures */
		struct {
			unsigned int head;
			unsigned int start_head;
		};
	};
};
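The iovec side of this union is exactly what user space hands in via writev(2); each array element becomes one segment of the iov_iter. A sketch gathering two buffers into one syscall (the path /tmp/vec.txt is an assumption):

#include <fcntl.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/vec.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return 1;
	char hdr[] = "header: ";
	char body[] = "payload\n";
	struct iovec iov[2] = {
		{ .iov_base = hdr,  .iov_len = strlen(hdr)  },
		{ .iov_base = body, .iov_len = strlen(body) },
	};
	/* the kernel wraps iov[] in an iov_iter with nr_segs = 2 */
	writev(fd, iov, 2);
	close(fd);
	return 0;
}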
Now look at iomap_file_buffered_write() and the iomap_apply() function it drives:
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, written = 0;

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);	/* page-based file write */
		if (ret <= 0)
			break;
		pos += ret;
		written += ret;
	}

	return written ? written : ret;
}
loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
	struct iomap iomap = { .type = IOMAP_HOLE };
	struct iomap srcmap = { .type = IOMAP_HOLE };
	loff_t written = 0, ret;
	u64 end;

	trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);

	/*
	 * Need to map a range from start position for length bytes. This can
	 * span multiple pages - it is only guaranteed to return a range of a
	 * single type of pages (e.g. all into a hole, all mapped or all
	 * unwritten). Failure at this point has nothing to undo.
	 *
	 * If allocation is required for this range, reserve the space now so
	 * that the allocation is guaranteed to succeed later on. Once we copy
	 * the data into the page cache pages, then we cannot fail otherwise we
	 * expose transient stale data. If the reserve fails, we can safely
	 * back out at this point as there is nothing to undo.
	 */
	/* calls xfs_buffered_write_iomap_begin to resolve the xfs mapping for this range */
	ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
	if (ret)
		return ret;
	if (WARN_ON(iomap.offset > pos))
		return -EIO;
	if (WARN_ON(iomap.length == 0))
		return -EIO;

	end = iomap.offset + iomap.length;
	if (srcmap.type != IOMAP_HOLE)
		end = min(end, srcmap.offset + srcmap.length);
	if (pos + length > end)
		length = end - pos;

	/* iomap_write_actor (via iov_iter_copy_from_user_atomic) copies the data from user space into the kernel */
	written = actor(inode, pos, length, data, &iomap,
			srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);

	if (ops->iomap_end) {
		ret = ops->iomap_end(inode, pos, length,
				     written > 0 ? written : 0,
				     flags, &iomap);	/* calls xfs_buffered_write_iomap_end */
	}

	return written ? written : ret;
}
Here iomap_begin and iomap_end correspond to xfs_buffered_write_iomap_begin and xfs_buffered_write_iomap_end; their job is to map file offsets to filesystem blocks through iomap, while iomap_write_actor performs the actual write page by page.
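The kind of extent mapping that iomap_begin resolves can be inspected from user space with the FIEMAP ioctl, which reports the logical-offset-to-physical-block layout of a file. A sketch (pass any regular file as argv[1]; 32 extents per call is an arbitrary choice):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* room for up to 32 extents in one call */
	size_t sz = sizeof(struct fiemap) + 32 * sizeof(struct fiemap_extent);
	struct fiemap *fm = calloc(1, sz);
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
		printf("extent %u: logical %llu -> physical %llu, len %llu\n", i,
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length);
	free(fm);
	close(fd);
	return 0;
}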