When we log in to a Linux console and run a command or launch a program, the system creates a corresponding process. If it is a short-lived process, it may exit as soon as the command or program finishes; if it is a long-running (resident) process, we can see its information with ps (pid, ppid, status, and so on). So how does the system actually start a program? How does a compiled binary get launched, and what happens along the way? This article walks through those details.
First, we write a simple program to build into a binary:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
    for (;;) {
        sleep(1);
    }
    return 0;
}
Compiling the code above produces a binary named a.out by default:
gcc t.c
The program is an infinite loop, so the process never exits on its own.
Once it is built, we run the binary.
To capture the system calls (and to support the execve analysis later), we run it under strace and record the syscall log:
strace -ttT -f -v -s1024 -o s.log ./a.out
Take a look at part of s.log; then check the process information with ps:
4 S root 1964 1 0 80 0 - 28232 - Feb04 ? 00:00:00 /usr/sbin/sshd -D
4 S root 826 1964 0 80 0 - 38685 - 10:23 ? 00:00:00 sshd: ansible [priv]
5 S ansible 828 826 0 80 0 - 38685 - 10:23 ? 00:00:00 sshd: ansible@pts/0
0 S ansible 829 828 0 80 0 - 29294 - 10:23 pts/0 00:00:00 -bash
4 S root 903 829 0 80 0 - 60315 - 10:23 pts/0 00:00:00 sudo su
4 S root 904 903 0 80 0 - 47953 - 10:23 pts/0 00:00:00 su
4 S root 905 904 0 80 0 - 29870 - 10:23 pts/0 00:00:00 bash
0 S root 29366 905 0 80 0 - 1194 - 11:16 pts/0 00:00:00 strace -ttT -f -v -s1024 -o s.log ./a.out
0 S root 29372 29366 0 80 0 - 1055 - 11:16 pts/0 00:00:00 ./a.out
Here pid=29372 is the pid of ./a.out: executing the binary a.out made the system create the ./a.out process with pid 29372.
From the output above, the parent of ./a.out is the strace process, whose parent is bash, and following the ancestor chain upwards (bash, su, sudo, bash, sshd) we eventually reach systemd (pid=1). So under normal circumstances bash's priority is inherited down from systemd, and the ./a.out process therefore has the same priority as systemd. "Normal circumstances" here means that nobody has explicitly changed a process's priority along the way.
Let's look at the sched info for pid=1 (the file /proc/1/sched):
systemd (1, #threads: 1)
-------------------------------------------------------------------
se.exec_start : 1452855565.617914
se.vruntime : 1814115.220358
se.sum_exec_runtime : 318197.286732
se.nr_migrations : 160296
nr_switches : 2762175
nr_voluntary_switches : 2760890
nr_involuntary_switches : 1285
se.load.weight : 1048576
se.runnable_weight : 1048576
se.avg.load_sum : 102
se.avg.runnable_load_sum : 102
se.avg.util_sum : 104448
se.avg.load_avg : 0
se.avg.runnable_load_avg : 0
se.avg.util_avg : 0
se.avg.last_update_time : 1452855565617152
se.avg.util_est.ewma : 10
se.avg.util_est.enqueued : 0
policy : 0
prio : 120
clock-delta : 54
mm->numa_scan_seq : 0
numa_pages_migrated : 0
numa_preferred_nid : -1
total_numa_faults : 0
current_node=0, numa_group_id=0
numa_faults node=0 task_private=0 task_shared=0 group_private=0 group_shared=0
Now the sched info for pid=29372:
a.out (29372, #threads: 1)
-------------------------------------------------------------------
se.exec_start : 1452938265.169067
se.vruntime : 7168006.445640
se.sum_exec_runtime : 94.083167
se.nr_migrations : 14
nr_switches : 6944
nr_voluntary_switches : 6944
nr_involuntary_switches : 0
se.load.weight : 1048576
se.runnable_weight : 1048576
se.avg.load_sum : 112
se.avg.runnable_load_sum : 112
se.avg.util_sum : 114688
se.avg.load_avg : 0
se.avg.runnable_load_avg : 0
se.avg.util_avg : 0
se.avg.last_update_time : 1452938265168896
se.avg.util_est.ewma : 10
se.avg.util_est.enqueued : 0
policy : 0
prio : 120
clock-delta : 41
mm->numa_scan_seq : 0
numa_pages_migrated : 0
numa_preferred_nid : -1
total_numa_faults : 0
current_node=0, numa_group_id=0
numa_faults node=0 task_private=0 task_shared=0 group_private=0 group_shared=0
The sched info for pid=1 (systemd) and pid=29372 (a.out) both show prio 120. Let's also check the nice/pri values reported by ps:
ps axo pid,ppid,comm,args,cmd,nice,pri --sort=ni
The rows for systemd and a.out:
1 0 systemd /usr/lib/systemd/systemd -- /usr/lib/systemd/systemd -- 0 19
29372 29366 a.out ./a.out ./a.out 0 19
Both rows show the same nice and pri values, with nice=0. Note that the pri column printed by ps is a transformed version of the kernel priority rather than the raw prio; the exact mapping is not covered here (see the ps documentation).
A previous article on process priority and nice already analyzed how a process's priority and nice value relate and how the priority is computed:
If a normal process has nice (NI) 0, then from the kernel's point of view its priority is prio = nice + DEFAULT_PRIO = 0 + (MAX_RT_PRIO + NICE_WIDTH / 2) = 0 + 100 + 40 / 2 = 120.
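To make the arithmetic concrete, here is a small userspace sketch that reproduces the kernel's nice-to-priority mapping; the macros are re-declared locally for illustration, using the values from include/linux/sched/prio.h:

#include <stdio.h>

#define MAX_NICE 19
#define MIN_NICE (-20)
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)        /* 40 */
#define MAX_RT_PRIO 100
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) /* 120 */
#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)

int main(void)
{
    /* nice 0 -> prio 120, matching /proc/<pid>/sched above */
    printf("nice %3d -> prio %d\n", 0, NICE_TO_PRIO(0));
    printf("nice %3d -> prio %d\n", MIN_NICE, NICE_TO_PRIO(MIN_NICE)); /* 100 */
    printf("nice %3d -> prio %d\n", MAX_NICE, NICE_TO_PRIO(MAX_NICE)); /* 139 */
    return 0;
}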
The sched files show that both a.out and systemd have policy 0, i.e. both are scheduled as SCHED_NORMAL:
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
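From userspace, the policy can be confirmed with the sched_getscheduler() syscall wrapper; a minimal sketch (SCHED_OTHER is the userspace name for SCHED_NORMAL):

#include <stdio.h>
#include <sched.h>

int main(void)
{
    int policy = sched_getscheduler(0); /* 0 means the calling process */
    printf("policy=%d (SCHED_OTHER is %d)\n", policy, SCHED_OTHER);
    return 0;
}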
So systemd's nice is 0, a.out's nice is also 0, and therefore a.out's prio is 120, exactly the value we saw in /proc/29372/sched. (During boot there is also the pid=0 task, the idle/swapper task from which every other process ultimately descends; its nice is set to 0 during startup, which is why systemd's nice is 0 as well. Note that pid 0 is the swapper, not init: init/systemd is pid 1.)
The point of tracing where a.out's prio comes from is to show that on Linux every process is ultimately a descendant of systemd (setting the pid-0 idle task aside), and that under normal circumstances a normal process's nice and prio stay in line with systemd's.
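Before diving into the kernel side of execve, here is a minimal userspace sketch of the launch sequence a shell performs: fork() a child, then replace the child's image via execve(). The path and argv are just illustrative:

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = fork();
    if (pid == 0) {
        /* child: replace this process image with ./a.out */
        char *argv[] = { "./a.out", NULL };
        char *envp[] = { NULL };
        execve("./a.out", argv, envp);
        perror("execve"); /* reached only if execve fails */
        _exit(127);
    }
    /* parent: wait for the child, as an interactive shell would */
    waitpid(pid, NULL, 0);
    return 0;
}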
Now let's look at what execve actually does. The execve system call is defined in the kernel's fs/exec.c:
SYSCALL_DEFINE3(execve,
const char __user *, filename,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
return do_execve(getname(filename), argv, envp); // getname returns a struct filename
}
int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv }; // program arguments
struct user_arg_ptr envp = { .ptr.native = __envp }; // environment variables
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); // AT_FDCWD = -100
}
static int do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
{
return __do_execve_file(fd, filename, argv, envp, flags, NULL); // flags=0, fd=-100
}
The logic of __do_execve_file():
static int __do_execve_file(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags, struct file *file) // flags=0, file=NULL
{
char *pathbuf = NULL;
struct linux_binprm *bprm;
struct files_struct *displaced;
int retval;
if (IS_ERR(filename)) // getname() failed, e.g. the path exceeded 4095 bytes
return PTR_ERR(filename);
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
* don't check setuid() return code. Here we additionally recheck
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}
/* We're below the limit (still or again), so we don't want to make
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
retval = unshare_files(&displaced);
if (retval)
goto out_ret;
retval = -ENOMEM;
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); // allocate memory for bprm
if (!bprm)
goto out_files;
retval = prepare_bprm_creds(bprm); // copy the parent's creds into bprm->cred
if (retval)
goto out_free;
check_unsafe_exec(bprm);
current->in_execve = 1;
if (!file)
file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
sched_exec();
bprm->file = file;
if (!filename) {
bprm->filename = "none";
} else if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name; // bprm->filename = "./a.out" in our case
} else {
if (filename->name[0] == '\0')
pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
else
pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
fd, filename->name);
if (!pathbuf) {
retval = -ENOMEM;
goto out_unmark;
}
/*
* Record that a name derived from an O_CLOEXEC fd will be
* inaccessible after exec. Relies on having exclusive access to
* current->files (due to unshare_files above).
*/
if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
bprm->filename = pathbuf;
}
bprm->interp = bprm->filename; // the name of the file actually executed
retval = bprm_mm_init(bprm); // set up the process memory info and allocate one page as the initial stack
if (retval)
goto out_unmark;
retval = prepare_arg_pages(bprm, argv, envp);
if (retval < 0)
goto out;
retval = prepare_binprm(bprm); // read the first 256 bytes of bprm->file into bprm->buf
if (retval < 0)
goto out;
retval = copy_strings_kernel(1, &bprm->filename, bprm); // copy the program's file path onto the stack
if (retval < 0)
goto out;
bprm->exec = bprm->p;
retval = copy_strings(bprm->envc, envp, bprm); // copy the environment variables onto the stack
if (retval < 0)
goto out;
retval = copy_strings(bprm->argc, argv, bprm); // copy the program arguments onto the stack
if (retval < 0)
goto out;
would_dump(bprm, bprm->file);
retval = exec_binprm(bprm); // call exec_binprm() to run the program
if (retval < 0)
goto out;
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
free_bprm(bprm);
kfree(pathbuf);
if (filename)
putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;
out:
if (bprm->mm) {
acct_arg_size(bprm, 0);
mmput(bprm->mm);
}
out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;
out_free:
free_bprm(bprm);
kfree(pathbuf);
out_files:
if (displaced)
reset_files_struct(displaced);
out_ret:
if (filename)
putname(filename);
return retval;
}
int unshare_files(struct files_struct **displaced)
{
struct task_struct *task = current;
struct files_struct *copy = NULL;
int error;
error = unshare_fd(CLONE_FILES, &copy); // duplicate the files_struct
if (error || !copy) {
*displaced = NULL;
return error;
}
*displaced = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
return 0;
}
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) { // if fd is non-NULL and more than one task shares this files_struct, duplicate it
*new_fdp = dup_fd(fd, &error);
if (!*new_fdp)
return error;
}
return 0;
}
The linux_binprm structure is defined as follows:
struct linux_binprm {
#ifdef CONFIG_MMU
struct vm_area_struct *vma;
unsigned long vma_pages;
#else
# define MAX_ARG_PAGES 32
struct page *page[MAX_ARG_PAGES];
#endif
struct mm_struct *mm;
unsigned long p; /* current top of mem */
unsigned long argmin; /* rlimit marker for copy_strings() */
unsigned int
/*
* True after the bprm_set_creds hook has been called once
* (multiple calls can be made via prepare_binprm() for
* binfmt_script/misc).
*/
called_set_creds:1,
/*
* True if most recent call to the commoncaps bprm_set_creds
* hook (due to multiple prepare_binprm() calls from the
* binfmt_script/misc handlers) resulted in elevated
* privileges.
*/
cap_elevated:1,
/*
* Set by bprm_set_creds hook to indicate a privilege-gaining
* exec has happened. Used to sanitize execution environment
* and to set AT_SECURE auxv for glibc.
*/
secureexec:1;
#ifdef __alpha__
unsigned int taso:1;
#endif
unsigned int recursion_depth; /* only for search_binary_handler() */
struct file * file; // the file to execute
struct cred *cred; /* new credentials */
int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
unsigned int per_clear; /* bits to clear in current->personality */
int argc, envc; // number of arguments and environment variables
const char * filename; /* Name of binary as seen by procps */
const char * interp; /* Name of the binary really executed. Most
of the time same as filename, but could be
different for binfmt_{misc,script} */
unsigned interp_flags;
unsigned interp_data;
unsigned long loader, exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
char buf[BINPRM_BUF_SIZE];
} __randomize_layout;
bprm_mm_init() initializes the process's memory descriptor and allocates one page to serve as the initial stack:
static int bprm_mm_init(struct linux_binprm *bprm)
{
int err;
struct mm_struct *mm = NULL;
bprm->mm = mm = mm_alloc(); // allocate the mm_struct
err = -ENOMEM;
if (!mm)
goto err;
/* Save current stack limit for all calculations made during exec. */
task_lock(current->group_leader);
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
err = __bprm_mm_init(bprm);
if (err)
goto err;
return 0;
err:
if (mm) {
bprm->mm = NULL;
mmdrop(mm);
}
return err;
}
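The single page of initial stack is set up by __bprm_mm_init(), which bprm_mm_init() calls last. Below is an abridged sketch of that function from the same fs/exec.c (error handling elided; details vary slightly across kernel versions): a one-page VMA is installed at the very top of the user address space, and bprm->p, the "current top of mem" that the copy_strings() calls above push onto, points just below its top.

static int __bprm_mm_init(struct linux_binprm *bprm)
{
    int err;
    struct vm_area_struct *vma;
    struct mm_struct *mm = bprm->mm;

    bprm->vma = vma = vm_area_alloc(mm);
    ...
    /* one page at the highest stack address the architecture supports */
    vma->vm_end = STACK_TOP_MAX;
    vma->vm_start = vma->vm_end - PAGE_SIZE;
    vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
    vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

    err = insert_vm_struct(mm, vma);
    ...
    mm->stack_vm = mm->total_vm = 1;
    bprm->p = vma->vm_end - sizeof(void *);
    return 0;
    ...
}

mm_alloc(), called at the start of bprm_mm_init(), looks like this: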
struct mm_struct *mm_alloc(void)
{
struct mm_struct *mm;
mm = allocate_mm();
if (!mm)
return NULL;
memset(mm, 0, sizeof(*mm));
return mm_init(mm, current, current_user_ns());
}
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
if (mm_alloc_pgd(mm)) // allocate the page global directory for the new mm
goto fail_nopgd;
if (init_new_context(p, mm))
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
return mm;
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
free_mm(mm);
return NULL;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
if (unlikely(!mm->pgd))
return -ENOMEM;
return 0;
}
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret, *init;
ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
pgd_init((unsigned long)ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
}
return ret;
}
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
#define page_address(page) lowmem_page_address(page)
static __always_inline void *lowmem_page_address(const struct page *page)
{
return page_to_virt(page);
}
#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)
#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT)
#define __va(x) ((void *)__phys_to_virt((unsigned long)(x)))
#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET)
mm_alloc_pgd() calls pgd_alloc() to allocate one page (4K) for the process's page global directory and stores its linear address in task_struct->mm_struct->pgd.
Concretely this is done by __get_free_pages(gfp_mask, order), which uses alloc_pages() to allocate a page descriptor (struct page *page) from low memory (below 896M on 32-bit) and then converts that page to a virtual address via page_address(). In other words, __va(PFN_PHYS(page_to_pfn(page))) first turns the page descriptor into a physical address, (page - mem_map) << 12 (all physical page descriptors live in the mem_map array, and the shift by 12 corresponds to the 4K page size), and then turns the physical address into a virtual one via __va: take the low-memory physical address, subtract PHYS_OFFSET and add PAGE_OFFSET, i.e. (unsigned long)(x) - PHYS_OFFSET + PAGE_OFFSET.
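As a worked example of this arithmetic, the sketch below pushes a page frame number through the PFN_PHYS() and __va() steps. PHYS_OFFSET and PAGE_OFFSET here are made-up illustrative values, not taken from any particular kernel configuration:

#include <stdio.h>

#define PAGE_SHIFT 12                     /* 4K pages */
#define PHYS_OFFSET 0x0UL                 /* assumed RAM base */
#define PAGE_OFFSET 0xffff888000000000UL  /* assumed direct-map base */

/* __phys_to_virt(x): (x) - PHYS_OFFSET + PAGE_OFFSET */
static unsigned long phys_to_virt_addr(unsigned long phys)
{
    return phys - PHYS_OFFSET + PAGE_OFFSET;
}

int main(void)
{
    unsigned long pfn = 0x1234;             /* page frame number */
    unsigned long phys = pfn << PAGE_SHIFT; /* PFN_PHYS(pfn) */
    printf("pfn 0x%lx -> phys 0x%lx -> virt 0x%lx\n",
           pfn, phys, phys_to_virt_addr(phys));
    return 0;
}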
Next, exec_binprm() is called:
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
int ret;
/* Need to fetch pid before load_binary changes it */
old_pid = current->pid;
rcu_read_lock();
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();
ret = search_binary_handler(bprm);
...
return ret;
}
exec_binprm() in turn calls search_binary_handler(), which iterates over the executable-file formats Linux recognizes, finds the matching one, and calls its load_binary method:
int search_binary_handler(struct linux_binprm *bprm)
{
struct linux_binfmt *fmt;
int retval = -ENOEXEC;
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
bprm->recursion_depth++;
retval = fmt->load_binary(bprm);
bprm->recursion_depth--;
read_lock(&binfmt_lock);
put_binfmt(fmt);
}
read_unlock(&binfmt_lock);
return retval;
}
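How does a format get onto the formats list that search_binary_handler() walks? Each binary-format handler registers a struct linux_binfmt at init time via register_binfmt(); for ELF this happens in fs/binfmt_elf.c, abridged below (field order and surrounding details vary by kernel version):

static struct linux_binfmt elf_format = {
    .module = THIS_MODULE,
    .load_binary = load_elf_binary,
    .load_shlib = load_elf_library,
    .core_dump = elf_core_dump,
    .min_coredump = ELF_EXEC_PAGESIZE,
};

static int __init init_elf_binfmt(void)
{
    register_binfmt(&elf_format);
    return 0;
}
core_initcall(init_elf_binfmt);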
Since executables on Linux are generally in ELF format, let's go straight to its load_binary() implementation, load_elf_binary(), defined in fs/binfmt_elf.c:
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
...
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
...
unsigned long elf_entry;
...
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
} *loc;
...
loc = kmalloc(sizeof(*loc), GFP_KERNEL);
...
// convert the buffer read from the file earlier into an ELF header
loc->elf_ex = *((struct elfhdr *)bprm->buf);
...
// read the ELF program headers from the program file
elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
...
// walk the program headers looking for the interpreter
elf_ppnt = elf_phdata;
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
loff_t pos;
if (elf_ppnt->p_type != PT_INTERP)
continue;
...
elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
...
pos = elf_ppnt->p_offset;
// read the interpreter path from the program file, typically /lib64/ld-linux-x86-64.so.2
// for details on the interpreter, see http://man7.org/linux/man-pages/man8/ld.so.8.html
retval = kernel_read(bprm->file, elf_interpreter,
elf_ppnt->p_filesz, &pos);
...
// open the interpreter file
interpreter = open_exec(elf_interpreter);
...
pos = 0;
// read the interpreter's ELF header
retval = kernel_read(interpreter, &loc->interp_elf_ex,
sizeof(loc->interp_elf_ex), &pos);
...
break;
...
}
...
// release the resources of the old process image: threads, memory, files, etc.
retval = flush_old_exec(bprm);
...
// set up the various attributes of the new program
setup_new_exec(bprm);
...
// re-establish the position and size of the stack
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
...
// initialize the segment-boundary trackers
elf_bss = 0;
elf_brk = 0;
start_code = ~0UL;
end_code = 0;
start_data = 0;
end_data = 0;
// walk the program headers and map the code, data, etc. segments into memory
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
...
if (elf_ppnt->p_type != PT_LOAD)
continue;
...
vaddr = elf_ppnt->p_vaddr;
...
// map the program's code and data into virtual memory, much like the mmap syscall;
// this adds a struct vm_area_struct to the process's struct mm_struct
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
...
// record the position of each of the program's segments
k = elf_ppnt->p_vaddr;
if (k < start_code)
start_code = k;
if (start_data < k)
start_data = k;
...
k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
if (k > elf_bss)
elf_bss = k;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
if (k > elf_brk) {
bss_prot = elf_prot;
elf_brk = k;
}
}
// adjust the final position of each segment by the load bias
loc->elf_ex.e_entry += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;
...
// set the heap address
retval = set_brk(elf_bss, elf_brk, bss_prot);
...
if (interpreter) {
...
// load the interpreter and obtain its entry address
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
&interp_map_addr,
load_bias, interp_elf_phdata);
if (!IS_ERR((void *)elf_entry)) {
...
interp_load_addr = elf_entry;
elf_entry += loc->interp_elf_ex.e_entry;
}
...
} else {
// if the program has no interpreter, use its own entry address
elf_entry = loc->elf_ex.e_entry;
...
}
...
// set up the stack contents: argc/argv/envp and the ELF auxiliary vector
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
...
// record the segment addresses in the mm
current->mm->end_code = end_code;
current->mm->start_code = start_code;
current->mm->start_data = start_data;
current->mm->end_data = end_data;
current->mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
...
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
...
}
...
// start executing at elf_entry:
// if the program has an interpreter, this is the interpreter's entry address;
// if not, it is the program's own entry address.
// the interpreter locates the shared libraries the program depends on,
// loads them, resolves the function addresses, and then jumps to the
// program's own entry point
start_thread(regs, elf_entry, bprm->p);
...
return retval;
...
}
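To look at the same pieces of the ELF file from userspace (the entry point and the PT_INTERP interpreter path that load_elf_binary() extracts), here is a small self-contained sketch. It assumes a 64-bit ELF and keeps error handling minimal; readelf -h and readelf -l report the same information:

#include <elf.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <elf-file>\n", argv[0]);
        return 1;
    }
    FILE *f = fopen(argv[1], "rb");
    if (!f) {
        perror("fopen");
        return 1;
    }
    Elf64_Ehdr ehdr;
    if (fread(&ehdr, sizeof(ehdr), 1, f) != 1 ||
        memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
        fprintf(stderr, "not an ELF file\n");
        return 1;
    }
    printf("entry point: 0x%lx\n", (unsigned long)ehdr.e_entry);
    /* walk the program headers, as the kernel does with elf_phdata */
    for (int i = 0; i < ehdr.e_phnum; i++) {
        Elf64_Phdr phdr;
        fseek(f, (long)(ehdr.e_phoff + (Elf64_Off)i * ehdr.e_phentsize), SEEK_SET);
        if (fread(&phdr, sizeof(phdr), 1, f) != 1)
            break;
        if (phdr.p_type == PT_INTERP) {
            char interp[256] = { 0 };
            fseek(f, (long)phdr.p_offset, SEEK_SET);
            fread(interp, 1, sizeof(interp) - 1, f);
            printf("interpreter: %s\n", interp); /* e.g. /lib64/ld-linux-x86-64.so.2 */
        }
    }
    fclose(f);
    return 0;
}

Running it against our ./a.out (a dynamically linked binary) should print the interpreter path, typically /lib64/ld-linux-x86-64.so.2 on an x86-64 glibc system.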
That is what happens inside the kernel when a program (binary) is executed on Linux. The key points:
1. The pid-0 idle/swapper task starts with nice=0, and systemd's nice is 0 as well.
2. For a process scheduled as SCHED_NORMAL with nice=0, its children also get nice=0.
3. A new process is created via fork, and the specified program is run via the execve system call.
The article has also shown how a program's linear (virtual) addresses come into being when it is executed.