struct task_struct {
    /* process state */
    volatile long state;            /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;                    /* pointer to the kernel stack */
    atomic_t usage;                 /* reference count: how many users hold this structure */
    unsigned int flags;             /* per process flags, defined below */
    unsigned int ptrace;            /* ptrace(2) support: breakpoint debugging, tracing a running process */

    /* compiled in only for multiprocessor (SMP) kernels */
#ifdef CONFIG_SMP
    struct llist_node wake_entry;
    int on_cpu;
    unsigned int wakee_flips;
    unsigned long wakee_flip_decay_ts;
    struct task_struct *last_wakee;
    int wake_cpu;
#endif
    /* run-queue and scheduling state */
    int on_rq;
    int prio, static_prio, normal_prio;     /* scheduling priorities */
    unsigned int rt_priority;               /* real-time priority */
    /* scheduling class and scheduling entities */
    const struct sched_class *sched_class;
    struct sched_entity se;
    struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
    struct task_group *sched_task_group;    /* task-group linkage */
#endif
    struct sched_dl_entity dl;

#ifdef CONFIG_PREEMPT_NOTIFIERS
    /* list of struct preempt_notifier: */
    struct hlist_head preempt_notifiers;
#endif

    /* block-device I/O tracing (blktrace) */
#ifdef CONFIG_BLK_DEV_IO_TRACE
    unsigned int btrace_seq;
#endif

    /* scheduling policy and CPU affinity */
    unsigned int policy;
    int nr_cpus_allowed;
    cpumask_t cpus_allowed;

    /* RCU synchronization primitives */
#ifdef CONFIG_PREEMPT_RCU
    int rcu_read_lock_nesting;
    union rcu_special rcu_read_unlock_special;
    struct list_head rcu_node_entry;
    struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
    unsigned long rcu_tasks_nvcsw;
    bool rcu_tasks_holdout;
    struct list_head rcu_tasks_holdout_list;
    int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_SCHED_INFO
    struct sched_info sched_info;
#endif

    /* linkage in the global task list */
    struct list_head tasks;
#ifdef CONFIG_SMP
    struct plist_node pushable_tasks;
    struct rb_node pushable_dl_tasks;
#endif

    /* address-space management: each process has its own virtual
     * address space (4 GB on a 32-bit machine) */
    struct mm_struct *mm, *active_mm;
    /* per-thread vma caching */
    u32 vmacache_seqnum;
    struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
    struct task_rss_stat rss_stat;
#endif
/* task state */
    int exit_state;
    int exit_code, exit_signal;
    int pdeath_signal;          /* The signal sent when the parent dies */
    unsigned long jobctl;       /* JOBCTL_*, siglock protected */

    /* Used for emulating ABI behavior of previous Linux versions */
    unsigned int personality;

    /* scheduler bits, serialized by scheduler locks */
    unsigned sched_reset_on_fork:1;
    unsigned sched_contributes_to_load:1;
    unsigned sched_migrated:1;
    unsigned :0;                /* force alignment to the next boundary */

    /* unserialized, strictly 'current' */
    unsigned in_execve:1;       /* bit to tell LSMs we're in execve */
    unsigned in_iowait:1;
#ifdef CONFIG_MEMCG
    unsigned memcg_may_oom:1;
#endif
#ifdef CONFIG_MEMCG_KMEM
    unsigned memcg_kmem_skip_account:1;
#endif
#ifdef CONFIG_COMPAT_BRK
    unsigned brk_randomized:1;
#endif

    unsigned long atomic_flags; /* Flags needing atomic access. */

    struct restart_block restart_block;

    /* process ID and thread-group ID (the PID of the thread-group leader) */
    pid_t pid;
    pid_t tgid;

    /* guards against kernel-stack overflows */
#ifdef CONFIG_CC_STACKPROTECTOR
    /* Canary value for the -fstack-protector gcc feature */
    unsigned long stack_canary;
#endif
    /*
     * pointers to (original) parent process, youngest child, younger sibling,
     * older sibling, respectively.  (p->father can be replaced with
     * p->real_parent->pid)
     */
    struct task_struct __rcu *real_parent;  /* real parent process */
    struct task_struct __rcu *parent;       /* recipient of SIGCHLD, wait4() reports */
    /*
     * children/sibling forms the list of my natural children
     */
    struct list_head children;      /* list of my children */
    struct list_head sibling;       /* linkage in my parent's children list */
    struct task_struct *group_leader;       /* threadgroup leader */

    /*
     * ptraced is the list of tasks this task is using ptrace on.
     * This includes both natural children and PTRACE_ATTACH targets.
     * p->ptrace_entry is p's link on the p->parent->ptraced list.
     */
    struct list_head ptraced;
    struct list_head ptrace_entry;

    /* PID/PID hash table linkage. */
    struct pid_link pids[PIDTYPE_MAX];
    struct list_head thread_group;
    struct list_head thread_node;

    /* set up by do_fork() */
    struct completion *vfork_done;  /* for vfork() */
    int __user *set_child_tid;      /* CLONE_CHILD_SETTID */
    int __user *clear_child_tid;    /* CLONE_CHILD_CLEARTID */

    /* CPU-time accounting:
     * utime = execution time in user mode
     * stime = execution time in kernel mode */
    cputime_t utime, stime, utimescaled, stimescaled;
    cputime_t gtime;
    struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqlock_t vtime_seqlock;
    unsigned long long vtime_snap;
    enum {
        VTIME_SLEEPING = 0,
        VTIME_USER,
        VTIME_SYS,
    } vtime_snap_whence;
#endif
    unsigned long nvcsw, nivcsw;    /* context switch counts */
    u64 start_time;                 /* monotonic time in nsec */
    u64 real_start_time;            /* boot based time in nsec */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
    unsigned long min_flt, maj_flt;

    struct task_cputime cputime_expires;
    struct list_head cpu_timers[3];

/* process credentials */
    const struct cred __rcu *real_cred;     /* objective and real subjective task
                                             * credentials (COW) */
    const struct cred __rcu *cred;          /* effective (overridable) subjective task
                                             * credentials (COW) */
    char comm[TASK_COMM_LEN];       /* executable name excluding path
                                       - access with [gs]et_task_comm (which lock
                                         it with task_lock())
                                       - initialized normally by setup_new_exec */
/* file system info */
    struct nameidata *nameidata;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
    struct sysv_sem sysvsem;
    struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
    unsigned long last_switch_count;
#endif
/* filesystem information */
    struct fs_struct *fs;
/* open file information */
    struct files_struct *files;
/* namespaces */
    struct nsproxy *nsproxy;
/* signal handlers */
    struct signal_struct *signal;
    struct sighand_struct *sighand;

    sigset_t blocked, real_blocked;
    sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
    struct sigpending pending;

    unsigned long sas_ss_sp;
    size_t sas_ss_size;

    struct callback_head *task_works;

    struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
    kuid_t loginuid;
    unsigned int sessionid;
#endif
    struct seccomp seccomp;

/* Thread group tracking */
    u32 parent_exec_id;
    u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
 * mempolicy */
    spinlock_t alloc_lock;

    /* Protection of the PI data structures: */
    raw_spinlock_t pi_lock;

    struct wake_q_node wake_q;

#ifdef CONFIG_RT_MUTEXES
    /* PI waiters blocked on a rt_mutex held by this task */
    struct rb_root pi_waiters;
    struct rb_node *pi_waiters_leftmost;
    /* Deadlock detection and priority inheritance handling */
    struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    /* mutex deadlock detection */
    struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    unsigned int irq_events;
    unsigned long hardirq_enable_ip;
    unsigned long hardirq_disable_ip;
    unsigned int hardirq_enable_event;
    unsigned int hardirq_disable_event;
    int hardirqs_enabled;
    int hardirq_context;
    unsigned long softirq_disable_ip;
    unsigned long softirq_enable_ip;
    unsigned int softirq_disable_event;
    unsigned int softirq_enable_event;
    int softirqs_enabled;
    int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
    u64 curr_chain_key;
    int lockdep_depth;
    unsigned int lockdep_recursion;
    struct held_lock held_locks[MAX_LOCK_DEPTH];
    gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
    void *journal_info;

/* stacked block device info */
    struct bio_list *bio_list;

#ifdef CONFIG_BLOCK
/* stack plugging */
    struct blk_plug *plug;
#endif

/* VM state */
    struct reclaim_state *reclaim_state;            /* page-reclaim state */
    struct backing_dev_info *backing_dev_info;      /* block-device I/O traffic information */
    struct io_context *io_context;                  /* state used by the I/O scheduler */

    unsigned long ptrace_message;
    siginfo_t *last_siginfo;        /* For ptrace use. */
    struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
    u64 acct_rss_mem1;      /* accumulated rss usage */
    u64 acct_vm_mem1;       /* accumulated virtual memory usage */
    cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
    nodemask_t mems_allowed;        /* Protected by alloc_lock */
    seqcount_t mems_allowed_seq;    /* Seqence no to catch updates */
    int cpuset_mem_spread_rotor;
    int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
    /* Control Group info protected by css_set_lock */
    struct css_set __rcu *cgroups;
    /* cg_list protected by css_set_lock and tsk->alloc_lock */
    struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
    struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
    struct compat_robust_list_head __user *compat_robust_list;
#endif
    struct list_head pi_state_list;
    struct futex_pi_state *pi_state_cache;
#endif
    /* performance monitoring (perf events) */
#ifdef CONFIG_PERF_EVENTS
    struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
    struct mutex perf_event_mutex;
    struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
    unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *mempolicy;    /* Protected by alloc_lock */
    short il_next;
    short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
    int numa_scan_seq;
    unsigned int numa_scan_period;
    unsigned int numa_scan_period_max;
    int numa_preferred_nid;
    unsigned long numa_migrate_retry;
    u64 node_stamp;                 /* migration stamp */
    u64 last_task_numa_placement;
    u64 last_sum_exec_runtime;
    struct callback_head numa_work;

    struct list_head numa_entry;
    struct numa_group *numa_group;

    /*
     * numa_faults is an array split into four regions:
     * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
     * in this precise order.
     *
     * faults_memory: Exponential decaying average of faults on a per-node
     * basis. Scheduling placement decisions are made based on these
     * counts. The values remain static for the duration of a PTE scan.
     * faults_cpu: Track the nodes the process was running on when a NUMA
     * hinting fault was incurred.
     * faults_memory_buffer and faults_cpu_buffer: Record faults per node
     * during the current scan window. When the scan completes, the counts
     * in faults_memory and faults_cpu decay and these values are copied.
     */
    unsigned long *numa_faults;
    unsigned long total_numa_faults;

    /*
     * numa_faults_locality tracks if faults recorded during the last
     * scan window were remote/local or failed to migrate. The task scan
     * period is adapted based on the locality of the faults with different
     * weights depending on whether they were shared or private faults
     */
    unsigned long numa_faults_locality[3];

    unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
    struct tlbflush_unmap_batch tlb_ubc;
#endif

    struct rcu_head rcu;

    /*
     * cache last used pipe for splice
     */
    struct pipe_inode_info *splice_pipe;

    struct page_frag task_frag;

    /* delay accounting */
#ifdef CONFIG_TASK_DELAY_ACCT
    struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
    int make_it_fail;
#endif
    /* ... remaining members omitted in this excerpt ... */
};
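The tasks member above threads every task_struct into one global process list, and pid/comm identify each task. As a minimal sketch of how these fields are consumed (my illustration, not part of the original article; the module and function names are invented), the kernel module below walks that list with the real for_each_process() helper and prints every process:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>        /* task_struct, for_each_process
                                   (linux/sched/signal.h on newer kernels) */

/* Walk the global task list that the 'tasks' list_head links
 * together, printing each task's pid and comm. */
static int __init tasklist_demo_init(void)
{
    struct task_struct *p;

    rcu_read_lock();            /* the task list is RCU-protected */
    for_each_process(p)
        pr_info("pid=%d comm=%s\n", p->pid, p->comm);
    rcu_read_unlock();
    return 0;
}

static void __exit tasklist_demo_exit(void)
{
}

module_init(tasklist_demo_init);
module_exit(tasklist_demo_exit);
MODULE_LICENSE("GPL");

for_each_process() expands to a list walk over the tasks list starting at init_task; holding the RCU read lock keeps the list stable while it is traversed.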
The kernel uses copy-on-write (COW) to avoid copying all of the parent's data to the child when fork executes. Without COW, the kernel would have to create an identical copy of every one of the parent's memory pages for the child on each fork call.
Problem: the parent process modifies the data in page z, and at that moment the parent's and child's memory must diverge.
Copy the data only when there is no way around it: that is the core idea of copy-on-write. In the example, the modification of page z forces a private copy of the original page z to be made, so that parent and child no longer interfere with each other.
The kernel creates only the virtual address-space structures for the newly spawned child, duplicating the parent's virtual structures, but it allocates no physical memory for those segments: the child shares the parent's physical pages. Only when a write to one of those segments occurs does the kernel allocate separate physical pages for the child's corresponding segment.
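Copy-on-write is easy to observe from user space. The following demo program (my sketch, not from the original article) forks a child whose page initially shares physical memory with the parent; the child's write triggers the fault that splits them, so the parent's copy is unaffected:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int value = 1;              /* lives in a page shared COW after fork */
    pid_t pid = fork();

    if (pid < 0) {
        perror("fork");
        exit(EXIT_FAILURE);
    }
    if (pid == 0) {
        /* Child: the write faults, the kernel copies the page,
         * and the child gets its own private copy. */
        value = 42;
        printf("child:  value = %d\n", value);
        exit(EXIT_SUCCESS);
    }
    waitpid(pid, NULL, 0);
    /* Parent still sees the original data: the pages separated
     * only when the child wrote to its copy. */
    printf("parent: value = %d\n", value);
    return 0;
}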
Kernel threads are processes started directly by the kernel itself. A kernel thread effectively delegates a kernel function to a separate process, which executes "in parallel" with the other processes in the system (and, in fact, in parallel with the kernel itself). Kernel threads are often referred to as (kernel) daemons; they are used to carry out background tasks on the kernel's behalf.
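For reference, such a kernel daemon is typically created with the kthread API: kthread_run(), kthread_should_stop(), and kthread_stop() are the real interfaces, while the module and function names below are invented for illustration. A minimal sketch:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *worker;      /* our kernel daemon */

/* Thread body: loop until kthread_stop() asks us to exit. */
static int worker_fn(void *data)
{
    while (!kthread_should_stop()) {
        pr_info("kthread demo: doing background work\n");
        msleep(1000);
    }
    return 0;
}

static int __init kthread_demo_init(void)
{
    /* Create and wake the thread; it appears as [kthread-demo] in ps. */
    worker = kthread_run(worker_fn, NULL, "kthread-demo");
    return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit kthread_demo_exit(void)
{
    kthread_stop(worker);       /* makes kthread_should_stop() return true */
}

module_init(kthread_demo_init);
module_exit(kthread_demo_exit);
MODULE_LICENSE("GPL");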
A process must be terminated with the exit system call, which gives the kernel the opportunity to release the resources that process used back to the system. See kernel/exit.c -> do_exit(). In short, the function decrements the various reference counters; whenever a counter drops to zero, so that no process still uses the corresponding structure, the associated memory region is returned to the memory-management subsystem.
void do_exit(long code)
{
    struct task_struct *tsk = current;
    int group_dead;
    TASKS_RCU(int tasks_rcu_i);

    profile_task_exit(tsk);

    WARN_ON(blk_needs_flush_plug(tsk));

    if (unlikely(in_interrupt()))
        panic("Aiee, killing interrupt handler!");
    if (unlikely(!tsk->pid))
        panic("Attempted to kill the idle task!");

    /*
     * If do_exit is called because this processes oopsed, it's possible
     * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
     * continuing. Amongst other possible reasons, this is to prevent
     * mm_release()->clear_child_tid() from writing to a user-controlled
     * kernel address.
     */
    set_fs(USER_DS);

    ptrace_event(PTRACE_EVENT_EXIT, code);

    validate_creds_for_do_exit(tsk);

    /*
     * We're taking recursive faults here in do_exit. Safest is to just
     * leave this task alone and wait for reboot.
     */
    if (unlikely(tsk->flags & PF_EXITING)) {
        pr_alert("Fixing recursive fault but reboot is needed!\n");
        /*
         * We can do this unlocked here. The futex code uses
         * this flag just to verify whether the pi state
         * cleanup has been done or not. In the worst case it
         * loops once more. We pretend that the cleanup was
         * done as there is no way to return. Either the
         * OWNER_DIED bit is set by now or we push the blocked
         * task into the wait for ever nirwana as well.
         */
        tsk->flags |= PF_EXITPIDONE;
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
    }

    exit_signals(tsk);  /* sets PF_EXITING */
    /*
     * tsk->flags are checked in the futex code to protect against
     * an exiting task cleaning up the robust pi futexes.
     */
    smp_mb();
    raw_spin_unlock_wait(&tsk->pi_lock);

    if (unlikely(in_atomic())) {
        pr_info("note: %s[%d] exited with preempt_count %d\n",
                current->comm, task_pid_nr(current),
                preempt_count());
        preempt_count_set(PREEMPT_ENABLED);
    }

    /* sync mm's RSS info before statistics gathering */
    if (tsk->mm)
        sync_mm_rss(tsk->mm);
    acct_update_integrals(tsk);
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
        hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk->signal);
        if (tsk->mm)
            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    }
    acct_collect(code, group_dead);
    if (group_dead)
        tty_audit_exit();
    audit_free(tsk);

    tsk->exit_code = code;
    taskstats_exit(tsk, group_dead);

    exit_mm(tsk);

    if (group_dead)
        acct_process();
    trace_sched_process_exit(tsk);

    exit_sem(tsk);
    exit_shm(tsk);
    exit_files(tsk);
    exit_fs(tsk);
    if (group_dead)
        disassociate_ctty(1);
    exit_task_namespaces(tsk);
    exit_task_work(tsk);
    exit_thread();

    /*
     * Flush inherited counters to the parent - before the parent
     * gets woken up by child-exit notifications.
     *
     * because of cgroup mode, must be called before cgroup_exit()
     */
    perf_event_exit_task(tsk);

    cgroup_exit(tsk);

    /*
     * FIXME: do that only when needed, using sched_exit tracepoint
     */
    flush_ptrace_hw_breakpoint(tsk);

    TASKS_RCU(preempt_disable());
    TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
    TASKS_RCU(preempt_enable());
    exit_notify(tsk, group_dead);
    proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
    task_lock(tsk);
    mpol_put(tsk->mempolicy);
    tsk->mempolicy = NULL;
    task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
#endif
    /*
     * Make sure we are holding no locks:
     */
    debug_check_no_locks_held();
    /*
     * We can do this unlocked here. The futex code uses this flag
     * just to verify whether the pi state cleanup has been done
     * or not. In the worst case it loops once more.
     */
    tsk->flags |= PF_EXITPIDONE;

    if (tsk->io_context)
        exit_io_context(tsk);

    if (tsk->splice_pipe)
        free_pipe_info(tsk->splice_pipe);

    if (tsk->task_frag.page)
        put_page(tsk->task_frag.page);

    validate_creds_for_do_exit(tsk);

    check_stack_usage();
    preempt_disable();
    if (tsk->nr_dirtied)
        __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
    exit_rcu();
    TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));

    /*
     * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
     * when the following two conditions become true.
     *   - There is race condition of mmap_sem (It is acquired by
     *     exit_mm()), and
     *   - SMI occurs before setting TASK_RUNINNG.
     *     (or hypervisor of virtual machine switches to other guest)
     *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
     *
     * To avoid it, we have to wait for releasing tsk->pi_lock which
     * is held by try_to_wake_up()
     */
    smp_mb();
    raw_spin_unlock_wait(&tsk->pi_lock);

    /* causes final put_task_struct in finish_task_switch(). */
    tsk->state = TASK_DEAD;
    tsk->flags |= PF_NOFREEZE;  /* tell freezer to ignore us */
    schedule();
    BUG();
    /* Avoid "noreturn function does return". */
    for (;;)
        cpu_relax();    /* For when BUG is null */
}
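Viewed from user space, the exit code that reaches do_exit() is what the parent later collects with wait4()/waitpid(). A small illustrative program (my sketch, not from the article):

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0)
        exit(7);        /* child: sys_exit ends up calling do_exit() */

    int status;
    waitpid(pid, &status, 0);   /* reap the zombie; the kernel frees the rest */
    if (WIFEXITED(status))
        printf("child exited with code %d\n", WEXITSTATUS(status));
    return 0;
}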
This article introduced the four elements of a process and the process life cycle, walked through the main contents of the task_struct data structure, and covered the copy-on-write idea behind the fork system call, kernel daemons, and how process exit works.