上一节我们通过对x86的linux内核的讲解,知道了它的一个大概的启动过程。
/arch/x86/boot/header.S -> calll main -> /arch/x86/boot/main.c -> go_to_protected_mode() -> /arch/x86/boot/pm.c -> protected_mode_jump() -> /arch/x86/boot/pmjump.S -> jmpl *%eax -> /arch/x86/kernel/head_32.S -> .long i386_start_kernel -> /arch/x86/kernel/head32.c -> start_kernel() -> /init/main.c (C语言入口)
这其中的动作,基本都是找到对应的地址,然后设置各种设备的初始化信息,中断设置,键盘,控制台,idt...
当然,有相当一部分代码是用汇编语言完成的,这自然是底层硬件决定的,而且因为特殊性,再封装是没有必要的了。所以,汇编是最好的选择。
本篇,我们再来看看与CPU架构无关的 main.c 又都干了些什么,从而解开心中的谜团。
排除掉架构相关的代码,就是到了/init/main.c 中的 start_kernel(), 从这里我们可以看到操作系统启动时,大致干了啥。
/* /init/main.c */
/*
 * start_kernel() - architecture-independent C entry point of the kernel.
 *
 * Runs a fixed sequence of subsystem initializers. Interrupts are disabled
 * near the top (local_irq_disable()) and only enabled again after the
 * IRQ / timer / softirq setup (local_irq_enable()). The function finishes
 * by calling rest_init().
 *
 * The call order matters: several comments below state explicit ordering
 * requirements between initializers, so the sequence must not be reordered.
 */
asmlinkage __visible void __init start_kernel(void)
{
	char *command_line;	/* filled in by setup_arch() */
	char *after_dashes;	/* result of parse_args(); handed to the init-args parser below */

	set_task_stack_end_magic(&init_task);
	smp_setup_processor_id();
	debug_objects_early_init();

	cgroup_init_early();

	local_irq_disable();
	early_boot_irqs_disabled = true;

	/*
	 * Interrupts are still disabled. Do necessary setups, then
	 * enable them.
	 */
	boot_cpu_init();
	page_address_init();
	pr_notice("%s", linux_banner);
	setup_arch(&command_line);
	/*
	 * Set up the initial canary and entropy after arch
	 * and after adding latent and command line entropy.
	 */
	add_latent_entropy();
	add_device_randomness(command_line, strlen(command_line));
	boot_init_stack_canary();
	mm_init_cpumask(&init_mm);
	setup_command_line(command_line);
	setup_nr_cpu_ids();
	setup_per_cpu_areas();
	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
	boot_cpu_hotplug_init();

	build_all_zonelists(NULL);
	page_alloc_init();

	pr_notice("Kernel command line: %s\n", boot_command_line);
	parse_early_param();
	/* First pass over the command line; unknown options go to unknown_bootoption. */
	after_dashes = parse_args("Booting kernel",
				  static_command_line, __start___param,
				  __stop___param - __start___param,
				  -1, -1, NULL, &unknown_bootoption);
	/* Second pass: whatever parse_args() returned is handed to set_init_arg. */
	if (!IS_ERR_OR_NULL(after_dashes))
		parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
			   NULL, set_init_arg);

	jump_label_init();

	/*
	 * These use large bootmem allocations and must precede
	 * kmem_cache_init()
	 */
	setup_log_buf(0);
	vfs_caches_init_early();
	sort_main_extable();
	trap_init();
	mm_init();

	ftrace_init();

	/* trace_printk can be enabled here */
	early_trace_init();

	/*
	 * Set up the scheduler prior starting any interrupts (such as the
	 * timer interrupt). Full topology setup happens at smp_init()
	 * time - but meanwhile we still have a functioning scheduler.
	 */
	sched_init();
	/*
	 * Disable preemption - early bootup scheduling is extremely
	 * fragile until we cpu_idle() for the first time.
	 */
	preempt_disable();
	/* Interrupts must still be off here; warn and turn them off again if not. */
	if (WARN(!irqs_disabled(),
		 "Interrupts were enabled *very* early, fixing it\n"))
		local_irq_disable();
	radix_tree_init();

	/*
	 * Set up housekeeping before setting up workqueues to allow the unbound
	 * workqueue to take non-housekeeping into account.
	 */
	housekeeping_init();

	/*
	 * Allow workqueue creation and work item queueing/cancelling
	 * early.  Work item execution depends on kthreads and starts after
	 * workqueue_init().
	 */
	workqueue_init_early();

	rcu_init();

	/* Trace events are available after this */
	trace_init();

	if (initcall_debug)
		initcall_debug_enable();

	context_tracking_init();
	/* init some links before init_ISA_irqs() */
	early_irq_init();
	init_IRQ();
	tick_init();
	rcu_init_nohz();
	init_timers();
	hrtimers_init();
	softirq_init();
	timekeeping_init();
	time_init();
	sched_clock_postinit();
	printk_safe_init();
	perf_event_init();
	profile_init();
	call_function_init();
	/* Last check before interrupts are enabled for the first time. */
	WARN(!irqs_disabled(), "Interrupts were enabled early\n");
	early_boot_irqs_disabled = false;
	local_irq_enable();

	kmem_cache_init_late();

	/*
	 * HACK ALERT! This is early. We're enabling the console before
	 * we've done PCI setups etc, and console_init() must be aware of
	 * this. But we do want output early, in case something goes wrong.
	 */
	console_init();
	/* panic_later / panic_param are set outside this excerpt. */
	if (panic_later)
		panic("Too many boot %s vars at `%s'", panic_later,
		      panic_param);

	lockdep_info();

	/*
	 * Need to run this when irqs are enabled, because it wants
	 * to self-test [hard/soft]-irqs on/off lock inversion bugs
	 * too:
	 */
	locking_selftest();

	/*
	 * This needs to be called before any devices perform DMA
	 * operations that might use the SWIOTLB bounce buffers. It will
	 * mark the bounce buffers as decrypted so that their usage will
	 * not cause "plain-text" data to be decrypted when accessed.
	 */
	mem_encrypt_init();

#ifdef CONFIG_BLK_DEV_INITRD
	/* Disable the initrd if its first page lies below min_low_pfn. */
	if (initrd_start && !initrd_below_start_ok &&
	    page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
		pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
		    page_to_pfn(virt_to_page((void *)initrd_start)),
		    min_low_pfn);
		initrd_start = 0;
	}
#endif
	page_ext_init();
	kmemleak_init();
	debug_objects_mem_init();
	setup_per_cpu_pageset();
	numa_policy_init();
	acpi_early_init();
	/* late_time_init is an optional hook; call it only when set. */
	if (late_time_init)
		late_time_init();
	calibrate_delay();
	pid_idr_init();
	anon_vma_init();
#ifdef CONFIG_X86
	if (efi_enabled(EFI_RUNTIME_SERVICES))
		efi_enter_virtual_mode();
#endif
	thread_stack_cache_init();
	cred_init();
	fork_init();
	proc_caches_init();
	uts_ns_init();
	buffer_init();
	key_init();
	security_init();
	dbg_late_init();
	vfs_caches_init();
	pagecache_init();
	signals_init();
	seq_file_init();
	proc_root_init();
	nsfs_init();
	cpuset_init();
	cgroup_init();
	taskstats_init_early();
	delayacct_init();

	check_bugs();

	acpi_subsystem_init();
	arch_post_acpi_subsys_init();
	sfi_init_late();

	if (efi_enabled(EFI_RUNTIME_SERVICES)) {
		efi_free_boot_services();
	}

	/*
	 * Everything that is not one of the init calls above happens in
	 * rest_init(), e.g. spawning the first threads.
	 */
	/* Do the rest non-__init'ed, we're now alive */
	rest_init();
}

/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 *
 * NOTE(review): this comment appears to describe rest_init() below (the
 * noinline, non-__init function), not the completion declared next.
 */
/*
 * Completion signalled by rest_init() after kthreadd has been created;
 * kernel_init_freeable() waits on it.
 */
static __initdata DECLARE_COMPLETION(kthreadd_done);

/* main.c */
/*
 * rest_init() - final step of start_kernel().
 *
 * Spawns the init thread (kernel_init) and then kthreadd, signals
 * kthreadd_done, calls schedule_preempt_disabled() once and finally enters
 * cpu_startup_entry(), i.e. the calling boot thread becomes the idle loop.
 */
static noinline void __ref rest_init(void)
{
	struct task_struct *tsk;
	int pid;

	rcu_scheduler_starting();
	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	/* Create the init task first; it is the one that obtains pid 1. */
	pid = kernel_thread(kernel_init, NULL, CLONE_FS);
	/*
	 * Pin init on the boot CPU. Task migration is not properly working
	 * until sched_init_smp() has been run. It will set the allowed
	 * CPUs for init to the non isolated CPUs.
	 */
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
	rcu_read_unlock();

	numa_default_policy();
	/*
	 * Then create the kthreadd task and remember it in kthreadd_task.
	 * (The original notes state it gets pid 2 - presumably because it is
	 * the second task created; not shown by this code.)
	 */
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
	rcu_read_lock();
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
	rcu_read_unlock();

	/*
	 * Enable might_sleep() and smp_processor_id() checks.
	 * They cannot be enabled earlier because with CONFIG_PREEMPT=y
	 * kernel_thread() would trigger might_sleep() splats. With
	 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
	 * already, but it's stuck on the kthreadd_done completion.
	 */
	system_state = SYSTEM_SCHEDULING;

	/* Release kernel_init_freeable(), which waits on this completion. */
	complete(&kthreadd_done);

	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	schedule_preempt_disabled();
	/* Call into cpu_idle with preempt disabled */
	/* From here on the boot thread runs the idle loop. */
	cpu_startup_entry(CPUHP_ONLINE);
}
同样,这里有大量的 init 操作。但 rest_init() 稍微不太一样,至少它和硬件的关系不那么大了。它主要干三件大事:1. 创建init进程(kernel_init); 2. 创建kthreadd进程; 3. 让当前的启动线程进入idle循环(cpu_startup_entry),成为idle进程。这三个东西,也许更值得多探探究竟。毕竟,对于硬件我们还是外行。
init进程,又叫第一个进程,即pid为1的进程,是系统必不可少的进程。那它都干了啥呢?我们来看一下:
/* main.c */
/*
 * kernel_init() - thread function of the init task created by rest_init().
 *
 * Runs kernel_init_freeable(), waits for async init code, frees the init
 * memory, sets system_state to SYSTEM_RUNNING, and then tries to start an
 * init program, in this order: ramdisk_execute_command, execute_command,
 * then the fixed list /sbin/init, /etc/init, /bin/init, /bin/sh.
 *
 * Returns 0 as soon as one init program has been started successfully;
 * panics if a requested execute_command fails or no candidate works.
 */
static int __ref kernel_init(void *unused)
{
	int ret;

	/*
	 * Do the remaining init work (SMP bring-up, initcalls, console,
	 * root filesystem) before init memory is released below.
	 */
	kernel_init_freeable();
	/* need to finish all async __init code before freeing the memory */
	async_synchronize_full();
	ftrace_free_init_mem();
	jump_label_invalidate_initmem();
	free_initmem();
	mark_readonly();
	system_state = SYSTEM_RUNNING;
	numa_default_policy();

	rcu_end_inkernel_boot();

	/* First choice: the ramdisk init, if kernel_init_freeable() left one set. */
	if (ramdisk_execute_command) {
		ret = run_init_process(ramdisk_execute_command);
		if (!ret)
			return 0;
		pr_err("Failed to execute %s (error %d)\n",
		       ramdisk_execute_command, ret);
	}

	/*
	 * We try each of these until one succeeds.
	 *
	 * The Bourne shell can be used instead of init if we are
	 * trying to recover a really broken machine.
	 */
	/* An explicitly requested init must work; failure is fatal. */
	if (execute_command) {
		ret = run_init_process(execute_command);
		if (!ret)
			return 0;
		panic("Requested init %s failed (error %d).",
		      execute_command, ret);
	}
	/*
	 * Run one of the following init programs to bring the system up.
	 * Platforms and configurations differ, so several paths are tried;
	 * the first one that succeeds makes this function return 0.
	 */
	if (!try_to_run_init_process("/sbin/init") ||
	    !try_to_run_init_process("/etc/init") ||
	    !try_to_run_init_process("/bin/init") ||
	    !try_to_run_init_process("/bin/sh"))
		return 0;

	panic("No working init found. Try passing init= option to kernel. "
	      "See Linux Documentation/admin-guide/init.rst for guidance.");
}

/* /init/main.c */
/*
 * kernel_init_freeable() - the __init part of the init task's work.
 *
 * Waits until rest_init() has signalled kthreadd_done, then brings up the
 * remaining CPUs and subsystems, runs do_basic_setup(), opens /dev/console
 * and duplicates descriptor 0 twice, decides whether a ramdisk init exists,
 * and finally loads keys and default modules.
 */
static noinline void __init kernel_init_freeable(void)
{
	/*
	 * Wait until kthreadd is all set-up.
	 */
	wait_for_completion(&kthreadd_done);

	/* Now the scheduler is fully set up and can do blocking allocations */
	gfp_allowed_mask = __GFP_BITS_MASK;

	/*
	 * init can allocate pages on any node
	 */
	set_mems_allowed(node_states[N_MEMORY]);

	cad_pid = task_pid(current);

	smp_prepare_cpus(setup_max_cpus);
	/*
	 * Original author's note: binds the work queues to the CPUs so that
	 * each can later run its own work.
	 * NOTE(review): workqueue_init() is not shown here - confirm.
	 */
	workqueue_init();

	init_mm_internals();

	do_pre_smp_initcalls();
	lockup_detector_init();

	smp_init();
	sched_init_smp();

	page_alloc_init_late();
	/* The CPUs are ready; the real initialization work can start now. */
	do_basic_setup();

	/* Open the /dev/console on the rootfs, this should never fail */
	if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
		pr_err("Warning: unable to open an initial console.\n");

	/* Duplicate descriptor 0 twice (presumably yielding fds 1 and 2). */
	(void) ksys_dup(0);
	(void) ksys_dup(0);
	/*
	 * check if there is an early userspace init.  If yes, let it do all
	 * the work
	 */

	if (!ramdisk_execute_command)
		ramdisk_execute_command = "/init";

	/* Not accessible: clear it so kernel_init() skips it, and call prepare_namespace(). */
	if (ksys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
		ramdisk_execute_command = NULL;
		prepare_namespace();
	}

	/*
	 * Ok, we have completed the initial bootup, and
	 * we're essentially up and running. Get rid of the
	 * initmem segments and start the user-mode stuff..
	 *
	 * rootfs is available now, try loading the public keys
	 * and default modules
	 */

	integrity_load_keys();
	/* Load the default modules. */
	load_default_modules();
}

/*
 * Ok, the machine is now initialized. None of the devices
 * have been touched yet, but the CPU subsystem is up and
 * running, and memory and process management works.
 *
 * Now we can finally start doing some real work..
 */
static void __init do_basic_setup(void)
{
	cpuset_init_smp();
	shmem_init();
	driver_init();
	init_irq_proc();
	do_ctors();
	usermodehelper_enable();
	do_initcalls();
}

/* /drivers/base/init.c - driver model initialization */
/**
 * driver_init - initialize driver model.
 *
 * Call the driver model init functions to initialize their
 * subsystems. Called early from init/main.c.
 */
void __init driver_init(void)
{
	/* These are the core pieces */
	devtmpfs_init();
	devices_init();
	buses_init();
	classes_init();
	firmware_init();
	hypervisor_init();

	/* These are also core pieces, but must come after the
	 * core core pieces.
	 */
	platform_bus_init();
	cpu_dev_init();
	memory_dev_init();
	container_dev_init();
	of_core_init();
}

/* /init/main.c */
/*
 * This function requests modules which should be loaded by default and is
 * called twice right after initrd is mounted and right before init is
 * exec'd.  If such modules are on either initrd or rootfs, they will be
 * loaded before control is passed to userland.
 *
 * In this excerpt it only forwards to load_default_elevator_module().
 */
void __init load_default_modules(void)
{
	load_default_elevator_module();
}

/* /block/elevator.c */
/*
 * called during boot to load the elevator chosen by the elevator param
 *
 * Does nothing when chosen_elevator is empty. Otherwise looks the name up
 * under elv_list_lock and, only if it is not found, requests the module
 * "<name>-iosched".
 */
void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])
		return;

	/*
	 * Boot parameter is deprecated, we haven't supported that for MQ.
	 * Only look for non-mq schedulers from here.
	 */
	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator, false);
	spin_unlock(&elv_list_lock);

	/* The lock is dropped before request_module() is called. */
	if (!e)
		request_module("%s-iosched", chosen_elevator);
}
可以看到,init进程承担着非常重要的工作:它需要完成内存、页、工作队列、多CPU等的后续初始化,还要打开控制台、准备根文件系统、加载默认模块等等。更重要的是,它要负责执行开机启动程序(如 /sbin/init),而这决定了我们的系统如何运行。它如此重要,以至于在 rest_init() 中被最先创建出来(从而获得 pid 1),是一个必不可少的进程。
继init进程之后,kthreadd是第二个被创建的进程,它又是在干什么呢?实际上,它负责根据 kthread_create_list 上排队的请求,代为创建其他内核线程。
// /include/linux/kthread.h int kthreadd(void *unused) { struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ // 让kthreadd进程尽量少各种特殊配置,以便各子进程生成时,会带有各种特异功能 set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) // 上下文切换,即主动放弃cpu,此处是汇编实现 schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); // 创建一个内核线程(进程) create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } // /kernel/kthread.c 创建一个内核线程(进程) static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* If user was SIGKILLed, I release the structure. */ struct completion *done = xchg(&create->done, NULL); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } }
可见 kthreadd 的作用就是在一个死循环中,根据需要不停地创建一个个内核线程。
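idle进程是启动流程最后进入的状态:rest_init() 的末尾调用 cpu_startup_entry(),此后启动线程本身就成了idle进程。它的作用是:在没有其他任务需要调度时(need_resched() 为假)循环执行 idle 操作;一旦需要调度,就把CPU让给其他任务。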
/* kernel/sched/idle.c */
/*
 * cpu_startup_entry() - enter the idle loop on the calling CPU.
 *
 * Performs the arch/hotplug preparation calls and then calls do_idle() in
 * an endless loop; this function never returns.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
	/*
	 * This #ifdef needs to die, but it's too late in the cycle to
	 * make this generic (ARM and SH have never invoked the canary
	 * init for the non boot CPUs!). Will be fixed in 3.11
	 */
#ifdef CONFIG_X86
	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. The boot CPU already has it initialized but no harm
	 * in doing it again. This is a good place for updating it, as
	 * we wont ever return from this function (so the invalid
	 * canaries already on the stack wont ever trigger).
	 */
	boot_init_stack_canary();
#endif
	arch_cpu_idle_prepare();
	cpuhp_online_idle(state);
	/* do_idle() is repeated forever. */
	while (1)
		do_idle();
}

/*
 * Generic idle loop implementation
 *
 * Called with polling cleared.
 *
 * One pass: stay in the inner loop while need_resched() is false, choosing
 * between cpu_idle_poll() and cpuidle_idle_call() on every iteration; once
 * a reschedule is needed, leave idle and call schedule_idle().
 */
static void do_idle(void)
{
	int cpu = smp_processor_id();
	/*
	 * If the arch has a polling bit, we maintain an invariant:
	 *
	 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
	 * rq->idle).  This means that, if rq->idle has the polling bit set,
	 * then setting need_resched is guaranteed to cause the CPU to
	 * reschedule.
	 */

	__current_set_polling();
	tick_nohz_idle_enter();

	while (!need_resched()) {
		check_pgt_cache();
		rmb();

		/* An offlined CPU is handed to the hotplug / arch "dead" path. */
		if (cpu_is_offline(cpu)) {
			tick_nohz_idle_stop_tick_protected();
			cpuhp_report_idle_dead();
			arch_cpu_idle_dead();
		}

		local_irq_disable();
		arch_cpu_idle_enter();

		/*
		 * In poll mode we reenable interrupts and spin. Also if we
		 * detected in the wakeup from idle path that the tick
		 * broadcast device expired for us, we don't want to go deep
		 * idle as we know that the IPI is going to arrive right away.
		 */
		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
			tick_nohz_idle_restart_tick();
			/* Polling idle: spin instead of entering an idle state. */
			cpu_idle_poll();
		} else {
			cpuidle_idle_call();
		}
		arch_cpu_idle_exit();
	}

	/*
	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
	 * be set, propagate it into PREEMPT_NEED_RESCHED.
	 *
	 * This is required because for polling idle loops we will not have had
	 * an IPI to fold the state for us.
	 */
	preempt_set_need_resched();
	tick_nohz_idle_exit();
	__current_clr_polling();

	/*
	 * We promise to call sched_ttwu_pending() and reschedule if
	 * need_resched() is set while polling is set. That means that clearing
	 * polling needs to be visible before doing these things.
	 */
	smp_mb__after_atomic();

	sched_ttwu_pending();
	schedule_idle();

	if (unlikely(klp_patch_pending(current)))
		klp_update_patch_state(current);
}

/*
 * cpu_idle_poll() - busy-wait variant of idle.
 *
 * Enables interrupts and spins on cpu_relax() until a reschedule is flagged
 * or neither polling condition holds any more. Always returns 1.
 */
static noinline int __cpuidle cpu_idle_poll(void)
{
	rcu_idle_enter();
	trace_cpu_idle_rcuidle(0, smp_processor_id());
	local_irq_enable();
	stop_critical_timings();

	while (!tif_need_resched() &&
		(cpu_idle_force_poll || tick_check_broadcast_expired()))
		cpu_relax();
	start_critical_timings();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	rcu_idle_exit();

	return 1;
}

/*
 * NOTE(review): the two snippets below are labelled as coming from two
 * different architecture trees (sh and powerpc); the definitions used on
 * x86 are not shown in these notes.
 */
/* arch/sh/include/asm/processor.h */
/* Here cpu_relax() is just a compiler barrier. */
#define cpu_relax() barrier()

/* arch/powerpc/boot/io.h */
/* Empty asm statement with a "memory" clobber. */
static inline void barrier(void)
{
	asm volatile("" : : : "memory");
}
idle 进程就是不停地检查是否需要重新调度:不需要时,要么轮询等待(cpu_idle_poll),要么调用 cpuidle_idle_call() 让CPU进入空闲状态;需要时则调用 schedule_idle() 把CPU交给其他任务。
当然了,在有的精简系统中,idle进程并非是必须的,但其思想却是值得一学的。