the task_struct struct

| 标签 linux 

0x00 learning with kernel

据说学习内核的方法有很多.
有人说可以把操作系统代码当成一坨内存里面东东, 然后由中断不断的来叫他们起来干活.
有人说也可以从 CPU 角度看, 毕竟代码总是要被 CPU 加载执行.
也有人说可以用进程入手, 进程是运行的实体, 像线索一样联系着 kernel 的各个部分.
我就准备先从进程入手, 去窥视一下内核的复杂度.

0x02 Simple linux kernel architectures

kernel architectures

简单的内核架构如上图, 就是系统调用层之下的部分.

0x10 process descriptor overview

task_struct 相关的信息 [1] 放在 include/linux/sched.h 中.
不过内核调度的实体是线程, 所以用户态的每一个线程都对应着一个 task_struct.
从内核角度来看, 线程不过是一些共享一部分资源的 task_struct, 共享哪些资源由 clone 时候的部分参数标志决定.

clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, 0);

剔除了大量无关的条件编译 (smp, btrace, numa…), 还有一部分看不懂的条件编译, 源码如下.
(虽然说看不懂选项, 但是这些条件编译的信息在你 make menuconfig, 编译选项 help 中都有说明)

struct task_struct {
/* 内核提供 set_task_state 与 set_current_state 宏来修改当前进程状态 */
volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;			/* 进程的内核栈 */
atomic_t usage;
unsigned int flags;	/* per process flags, defined below */
unsigned int ptrace; /* 调试相关 */

/* 这个部分是与进程调度相关的, 优先级, 调度策略 */
int on_rq; /* 运行队列 */

int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;

/*
这也是和调度相关的.
linux kernel 提供 sched_getaffinity() 与 sched_setafffinity()
系统调用来 cpu 进程亲和力掩码.
*/
unsigned int policy;

cpumask_t cpus_allowed;

/*
进程的链表, 把所有进程用双向循环链表链起来.
include/linux/list.h
*/
sturct list_head {
struct list_head *next, *prev;
}
struct list_head tasks;

/*
进程的地址空间, 内存的代码段数据段什么的.
IA32 每个进程 2^32(4G)VMM.
*/
struct mm_struct *mm, *active_mm;

/* 任务的状态 */
int exit_state;
int exit_code, exit_signal;		/* 退出代码, 退出信号 */
int pdeath_signal;				/*  当父进程死发送的信号  */
unsigned int jobctl;			/* JOBCTL_*, siglock protected */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
unsigned in_execve:1;	/* Tell the LSMs that the process is doing an execve */
unsigned in_iowait:1;


/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;

/* pid 进程 id, 默认最大值 PID_MAX_DEFAULT-1, 超过循环使用
* tgid(thread group id),getpid() 返回的是 tgid.
*/
pid_t pid;
pid_t tgid;	

/* 
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively.  (p->father can be replaced with 
* p->real_parent->pid)
*/
/*
进程的父子关系, 如果一个进程创建多个子进程, 则子进程具有兄弟关系.
说明参考 ULK 图 3-4
*/
struct task_struct *real_parent; /* 指向创建当前进程的父亲, 如果父亲不存在指向 1(init) */
struct task_struct *parent; /* 接收 SIGCHLD, wait4() reports, 一般等于 real_parent, 除非有进程发起 ptrace() */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children;	/* 链表里面所有进程都是 current 的子进程 */
struct list_head sibling;	/* 是 current 的子进程的兄弟之间的链表包含 current */
struct task_struct *group_leader;	/* 指向 current 所在进程组的指针 */

/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;

/* PID/PID hash table linkage. 
利用 hash 数据结构特性 (search, insert, delete).
*/
struct pid_link pids[PIDTYPE_MAX];

/*
线程组链表
*/
struct list_head thread_group;

struct completion *vfork_done;		/* for vfork() */
int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */

/*
进程自己的一些时间统计
*/
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;

unsigned long nvcsw, nivcsw; 		/* context switch counts */
struct timespec start_time; 		/* monotonic time */
struct timespec real_start_time;	/* boot based time */

/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

struct task_cputime cputime_expires;
struct list_head cpu_timers[3];

/* 符号链接数 */
int link_count, total_link_count;

/* 当前进程的 cpu 状态, 在进程上下文切换起着关键作业. */
struct thread_struct thread;

/* filesystem information */
struct fs_struct *fs;

/* 打开的文件描述符列表 */
struct files_struct *files;

/* namespaces */
struct nsproxy *nsproxy;

/* 信号处理 */
struct signal_struct *signal;	/* 信号名称 */
struct sighand_struct *sighand; /* 处理函数 */

sigset_t blocked, real_blocked;
sigset_t saved_sigmask;	/* restored if set_restore_sigmask() was used */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct audit_context *audit_context;

/*
securtiy computing
*/
seccomp_t seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;

/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy */
/* 自旋锁类型, 内核同步保证线程安全 */
spinlock_t alloc_lock;

/* 
Protection of the PI data structures: 
基于 PI 协议的等待互斥锁,其中 PI 指的是 priority inheritance(优先级继承) 
*/
raw_spinlock_t pi_lock;

/* 
文件系统日志相关 
*/
void *journal_info;

/* 
stacked block device info 
块设备列表
*/
struct bio_list *bio_list;

/*
VM state, 内存回收
*/
struct reclaim_state *reclaim_state;

/*
存放块设备 I/O 数据流量信息
*/
struct backing_dev_info *backing_dev_info;

/*
I/O 调度器所使用的信息
*/
struct io_context *io_context;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use.  */
struct task_io_accounting ioac;

struct compat_robust_list_head __user *compat_robust_list;

/*
rcu 链表
*/
struct rcu_head rcu;

/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;

/*
* when (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for some dirty throttling pause
*/
int nr_dirtied;
int nr_dirtied_pause;

/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;

/*
 socket 控制消息
*/
struct list_head	*scm_work_list;
};

0x01 data struct

list

struct list_head tasks; 内核利用 C 语言提供的宏模拟泛型, include/linux/list.h 里面是双向链表的具体实现.

0x02 synchronization mechanism

atomic

atomic_t usage 取自 task_struct, atomic 是一个同步原语, kernel 的实现保证对该变量的增加或减少是原子操作, 在 x86 上具体实现是利用 lock 指令前缀锁住内存总线.

raw_spinlock_t

raw_spinlock_t pi_lock; raw_spinlock_t 也是内核提供的一个同步原语 (自旋锁), 如果锁已被占用, 则 cpu 进入忙等 (不断尝试获得锁).

rcu

struct rcu_head rcu; RCU (read-copy-update) 也是内核提供的一个同步原语 (RCU 指针), 它是通过指针而不是锁来实现对共享数据结构的访问.

reference

  1. «Understanding the Linux Kernel, 3rd Edition» 


PREV     NEXT