
File system fundamentals, and designing a kernel buffer based on file system principles

2012-08-17 23:15
How does a process's fd get tied to an actual physical file?

On the process side, the relevant pieces are the fd and the files_struct.

The latter holds an array of file structures, each representing an open file instance in the kernel.

Each file structure's f_pos records, for that particular open instance, the current read/write offset.

Their relationship can be summarized as:

struct files_struct {
	struct file *fd_array[MAX_FD];
};

The fd itself is simply an index used to pick out an element of fd_array.
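To make the lookup concrete, here is a minimal sketch, written against the simplified structure above, of how an fd resolves to its struct file. fd_to_file is not a kernel function, and the real kernel goes through current->files and a struct fdtable, so MAX_FD and the direct array access are simplifications:

struct file *fd_to_file(struct files_struct *files, unsigned int fd)
{
	/* the descriptor is nothing more than an index into fd_array */
	if (fd >= MAX_FD)
		return NULL;
	return files->fd_array[fd];
}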

The relationship between file structures and inodes is many-to-one;

a given file reaches its inode through its address_space, i.e. file->f_mapping->host in the kernel.

Besides the file structure, another important structure that maps to an inode is the dentry.

A dentry's d_inode field points to its inode, d_parent points to the parent directory's dentry, and d_subdirs is the head of the list of child dentries (each child is linked into it through its own d_child field). With the parent pointer, the child list and the inode pointer, we can pin down exactly where a directory entry sits in the tree. Is the dentry-to-inode relationship then one-to-one? No: in some cases, hard links for example, several dentries end up referring to the same inode.
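Here is a hedged sketch of the pointer chases just described, using 2.6-era field names (f_dentry is f_path.dentry on newer kernels, and the child link is d_u.d_child on some versions); show_relations is purely illustrative:

static void show_relations(struct file *filp)
{
	struct inode  *inode  = filp->f_mapping->host;	/* file -> inode via address_space */
	struct dentry *dentry = filp->f_dentry;		/* file -> dentry */
	struct dentry *parent = dentry->d_parent;	/* dentry -> parent dentry */
	struct dentry *child;

	printk("inode %lu under directory %s\n",
			inode->i_ino, parent->d_name.name);

	/* dentry -> children: d_subdirs is the list head of child dentries */
	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
		printk("  child: %s\n", child->d_name.name);
}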

The point of mount is to create a vfsmount object and, hanging off it, an in-memory super block object. The super block carries the file system's operation tables; whenever a file is subsequently opened, a file structure is created and the read/write operations of its inode are taken from the super block's operations.

If the file system on this device has already been mounted (note: this specific device, of this file system type), no new super block is created and the existing one is returned. In the more common case where the device has not been mounted before, a new super block object is allocated, the low-level routines of the opened block device are called to read the on-disk superblock, and the new super block's fields are filled in from it. Finally the super block's s_instances hook is chained onto the file_system_type's fs_supers list. fs_supers is the list head for all super block objects of the same file system type: if /dev/sda1 and /dev/sda2 are both ext3 and are mounted one after the other, two super block objects end up linked on the ext3 file_system_type's fs_supers.
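A heavily simplified sketch of that lookup-or-create step follows (the real logic lives in fs/super.c:sget(); locking, reference counting and error handling are omitted, and find_or_alloc_sb()/read_disk_super() are made-up names used only for illustration):

static struct super_block *find_or_alloc_sb(struct file_system_type *type,
		struct block_device *bdev)
{
	struct super_block *sb;

	/* this device already mounted with this fs type? reuse its super block */
	list_for_each_entry(sb, &type->fs_supers, s_instances)
		if (sb->s_bdev == bdev)
			return sb;

	/* otherwise build a new one, fill it from the on-disk superblock,
	 * and chain it onto the per-fs-type fs_supers list */
	sb = alloc_super(type);		/* internal helper in fs/super.c */
	sb->s_bdev = bdev;
	read_disk_super(sb, bdev);	/* made-up: read and parse the on-disk superblock */
	list_add(&sb->s_instances, &type->fs_supers);
	return sb;
}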

Borrowing the design ideas of the file system page cache, we can build a kernel buffer: allocate page frames from high memory and access them through kmap_atomic, so that the pages are physically non-contiguous while the interface offered to the user is a contiguous range of offsets.

A side note on kmap. kmap modifies the kernel master page table, which all processes share (concretely: at fork time the kernel part of the master page table's pgd entries, think of them as pointers, is copied, and those pgd entries all point to the same shared pmd/pte tables). The kmap area's page tables are populated all the way down to the pte level at system initialization, so processes forked later never fault when touching kmap addresses. vmalloc, by contrast, may have to create brand-new pgd entries; a process whose page tables were copied before that vmalloc has no such entry in its kernel address range, so its first access faults and the entry is synced over from the master page table.
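For reference, a simplified sketch of how the x86 fault handler repairs such a vmalloc-area fault: the missing top-level entry is simply copied from the master table. The real code is vmalloc_fault()/vmalloc_sync_one() in arch/x86/mm/fault.c; this version folds the intermediate levels away and skips the sanity checks:

static int vmalloc_fault_sketch(unsigned long address)
{
	unsigned int index = pgd_index(address);
	pgd_t *pgd   = current->active_mm->pgd + index;	/* faulting task's table    */
	pgd_t *pgd_k = init_mm.pgd + index;		/* kernel master page table */

	if (!pgd_present(*pgd_k))
		return -1;		/* not a populated vmalloc address */

	set_pgd(pgd, *pgd_k);		/* sync the entry; the fault will not repeat */
	return 0;
}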

So, can several processes executing kmap_atomic at the same time step on each other? No. Although every kmap_atomic call ends up modifying the kernel master page table, the virtual addresses kmap_atomic hands out are per-CPU and never overlap; since the virtual addresses differ, the pte slots being written in the master table differ as well, and no synchronization is needed.
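That per-CPU, non-overlapping address assignment comes from the fixmap area. The sketch below roughly follows the classic 32-bit x86 implementation in arch/x86/mm/highmem_32.c (debug checks and details trimmed):

void *kmap_atomic_sketch(struct page *page, enum km_type type)
{
	enum fixed_addresses idx;
	unsigned long vaddr;

	pagefault_disable();
	if (!PageHighMem(page))
		return page_address(page);	/* lowmem is permanently mapped */

	/* each cpu owns its own group of KM_TYPE_NR fixmap slots,
	 * so two cpus can never pick the same virtual address */
	idx = type + KM_TYPE_NR * smp_processor_id();
	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);

	/* install the pte in the shared kernel master page table */
	set_pte(kmap_pte - idx, mk_pte(page, kmap_prot));
	arch_flush_lazy_mmu_mode();

	return (void *)vaddr;
}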

The complete code is as follows:

/*
 *  mm/critical_buf.c
 *
 *  Kernel critical buffer
 *
 *  A simple critical buffer for non-blocking environments;
 *  only one instance should exist at any one time.
 *
 *  Copyright (C) 1985-2012  ZTE
 *
 *  2012-08-21  created
 *  chenyu105 at gmail dot com
 */

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/critical_buf.h>

static critical_cmd global_critical_cmd;
static percpu_pagecache buf_pagecache[NR_CPUS];

void reset_cpu_buf_pos(int cpu)
{
	buf_pagecache[cpu].read_pos = 0;
	buf_pagecache[cpu].write_pos = 0;
	buf_pagecache[cpu].over_flow = 0;
	buf_pagecache[cpu].last_writed_pos = 0;
}
EXPORT_SYMBOL(reset_cpu_buf_pos);

static int alloc_buf_on_cpu(int cpu, unsigned long size)
{
	int i = 0;
	struct page *page = NULL;
	unsigned long max_buf_pfn = size >> PAGE_SHIFT;

	/*
	 * buf in use
	 */
	if (buf_pagecache[cpu].pagecache_array != NULL)
		goto out;

retry:
	buf_pagecache[cpu].pagecache_array = (struct page **)
		kmalloc(max_buf_pfn * sizeof(struct page *), GFP_KERNEL);
	if (buf_pagecache[cpu].pagecache_array == NULL) {
		printk("alloc array failed, retry\n");
		goto retry;
	}
	for (; i < max_buf_pfn; ++i) {
		page = alloc_pages(GFP_HIGHUSER | __GFP_COLD, 0);
		if (page == NULL) {
			printk("alloc page failed, retry\n");
			i--;
			continue;
		}
		buf_pagecache[cpu].pagecache_array[i] = page;
	}
out:
	reset_cpu_buf_pos(cpu);
	return 0;
}

static int free_buf_on_cpu(int cpu, unsigned long size)
{
	int i = 0;
	unsigned long max_buf_pfn = size >> PAGE_SHIFT;
	struct page *page = NULL;

	/* nothing to free if the page array was never allocated */
	if (buf_pagecache[cpu].pagecache_array == NULL)
		return 0;

	reset_cpu_buf_pos(cpu);

	for (; i < max_buf_pfn; ++i) {
		page = buf_pagecache[cpu].pagecache_array[i];
		if (page != NULL) {
			__free_pages(page, 0);
			buf_pagecache[cpu].pagecache_array[i] = NULL;
		}
	}

	kfree(buf_pagecache[cpu].pagecache_array);
	buf_pagecache[cpu].pagecache_array = NULL;
	return 0;
}

static void byte_memcpy(void *to, const void *from, size_t n)
{
	const char *c_from = from;
	char *c_to = to;

	while (n-- > 0)
		*c_to++ = *c_from++;
}

/*
 * steal from generic_perform_write
 */
static int generic_write_buf_cpu(int cpu, char *outbuf,
		unsigned long len, unsigned long *ppos)
{
	unsigned long pos = 0;		/* last write pos within buf */
	unsigned long index = 0;	/* page index within buf */
	unsigned long offset = 0;	/* offset within page */
	unsigned long writed = 0;	/* bytes copied from outbuf */

	pos = *ppos;
	while (len) {
		struct page *page;
		unsigned long bytes;
		char *kaddr;

		index = pos >> PAGE_SHIFT;
		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset, len);

		page = buf_pagecache[cpu].pagecache_array[index];

		kaddr = kmap_atomic(page, KM_USER0);
		byte_memcpy(kaddr + offset, outbuf + writed, bytes);
		kunmap_atomic(kaddr, KM_USER0);

		pos += bytes;
		len -= bytes;
		writed += bytes;
	}
	*ppos = pos;
	return writed;
}

/*
 * steal from do_generic_file_read
 */
static int generic_read_buf_cpu(int cpu, char *inbuf,
		unsigned long len, unsigned long *ppos)
{
	unsigned long pos = 0;		/* read pos within buf */
	unsigned long index = 0;	/* page index within buf */
	unsigned long offset = 0;	/* offset within page */
	unsigned long readed = 0;	/* bytes copied into inbuf */

	pos = *ppos;
	while (len) {
		struct page *page;
		unsigned long bytes;
		char *kaddr;

		index  = pos >> PAGE_SHIFT;
		offset = (pos & (PAGE_SIZE - 1));
		bytes  = min_t(unsigned long, PAGE_SIZE - offset, len);

		page   = buf_pagecache[cpu].pagecache_array[index];

		kaddr  = kmap_atomic(page, KM_USER0);
		byte_memcpy(inbuf + readed, kaddr + offset, bytes);
		kunmap_atomic(kaddr, KM_USER0);

		pos += bytes;
		len -= bytes;
		readed += bytes;
	}
	*ppos = pos;
	return readed;
}

/*
 * force = 1, wrapped writing
 * force = 0, write limited to buf size
 */
int critical_write_buf_cpu(int cpu, char *outbuf,
		unsigned long len, int force)
{
	unsigned long *last_ppos  = &buf_pagecache[cpu].write_pos;
	unsigned long last_pos    = *last_ppos;
	unsigned long this_writed = 0;

	if (force) {

write_continue:
		if (last_pos + len > global_critical_cmd.buf_size) {
			/*
			 * wrapped writing
			 */
			int left = global_critical_cmd.buf_size - last_pos;

			buf_pagecache[cpu].over_flow = 1;
			this_writed += generic_write_buf_cpu(cpu, outbuf, left, last_ppos);
			*last_ppos = 0;
			last_pos = 0;
			len -= left;
			outbuf += left;
			goto write_continue;
		} else
			this_writed += generic_write_buf_cpu(cpu, outbuf, len, last_ppos);
	} else {
		if (last_pos + len > global_critical_cmd.buf_size) {
			int left = global_critical_cmd.buf_size - last_pos;

			this_writed += generic_write_buf_cpu(cpu, outbuf, left, last_ppos);
		} else {
			this_writed += generic_write_buf_cpu(cpu, outbuf, len, last_ppos);
		}
	}
	return this_writed;
}
EXPORT_SYMBOL(critical_write_buf_cpu);
EXPORT_SYMBOL(critical_write_buf_cpu);

/*
 * force = 1, wrapped reading
 * force = 0, if writes have wrapped start from the write pos (oldest data),
 *	otherwise read from the read pos up to the write pos
 */
int critical_read_buf_cpu(int cpu, char *inbuf,
		unsigned long len, int force)
{
	unsigned long *writedp = &buf_pagecache[cpu].write_pos;
	unsigned long *readedp = &buf_pagecache[cpu].read_pos;
	unsigned long *last_writedp = &buf_pagecache[cpu].last_writed_pos;
	unsigned long writed  = *writedp;
	unsigned long last_writed = *last_writedp;
	unsigned long readed  = *readedp;
	unsigned long to_read;
	unsigned long this_readed = 0;

	if (force) {

read_continue:
		if (readed + len > global_critical_cmd.buf_size) {
			/*
			 * wrapped reading
			 */
			to_read = global_critical_cmd.buf_size - readed;
			this_readed += generic_read_buf_cpu(cpu, inbuf,
					to_read, readedp);
			*readedp = 0;
			readed  = 0;
			inbuf  += to_read;
			len    -= to_read;
			goto read_continue;
		} else
			this_readed += generic_read_buf_cpu(cpu, inbuf,
					len, readedp);
	} else {
		if (buf_pagecache[cpu].over_flow) {
			/*
			 * If the write pos changed, update the read pos:
			 * we are supposed to read the oldest data first.
			 */
			if (writed != last_writed) {
				*last_writedp = writed;
				*readedp = writed;
				readed   = writed;
			}
			goto read_continue;
		} else {
			this_readed += generic_read_buf_cpu(cpu, inbuf,
					min_t(unsigned long, len, writed - readed), readedp);
		}
	}
	return this_readed;
}
EXPORT_SYMBOL(critical_read_buf_cpu);

int critical_initial_buffer(critical_cmd *cmd)
{
	int i = 0;
	cpumask_t cpumask = cmd->cpu_critical_map;
	unsigned long size = cmd->buf_size;

	unsigned long max_buf_size =
		(MAX_KMALLOC_SIZE / sizeof(unsigned long)) * PAGE_SIZE;
	if (size > max_buf_size)
		size = max_buf_size;

	global_critical_cmd.cpu_critical_map = cpumask;
	global_critical_cmd.buf_size = size;

	for_each_cpu_mask(i, cpumask)
		alloc_buf_on_cpu(i, size);
	return 0;
}
EXPORT_SYMBOL(critical_initial_buffer);

int critical_free_buffers(void)
{
	int i = 0;

	for_each_cpu_mask(i, global_critical_cmd.cpu_critical_map)
		free_buf_on_cpu(i, global_critical_cmd.buf_size);

	return 0;
}
EXPORT_SYMBOL(critical_free_buffers);

int critical_cpu_has_buf(int cpu)
{
	return cpu_isset(cpu, global_critical_cmd.cpu_critical_map);
}
EXPORT_SYMBOL(critical_cpu_has_buf);

void critical_reset_buf_pos(void)
{
	int i = 0;

	for_each_cpu_mask(i, global_critical_cmd.cpu_critical_map)
		reset_cpu_buf_pos(i);
}
EXPORT_SYMBOL(critical_reset_buf_pos);

/*
 *  kernel/sched_monitor.c
 *
 *  Kernel scheduler switch info
 *
 *  Copyright (C) 1985-2012  ZTE
 *
 *  2012-08-21  created
 *  chenyu105 at gmail dot com
 *
 */

#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/cpuset.h>
#include <linux/kallsyms.h>
#include <linux/critical_buf.h>

#include "rtmutex_common.h"

typedef void (*hook_func)(struct task_struct *prev,
		struct task_struct *next);
static hook_func callback_func = NULL;

static int system_recording = 0;
static DEFINE_RWLOCK(hook_rwlock);

static unsigned long task_switchin_time[NR_CPUS];

static const char *task_state_array[] = {
	"R (running)",		/*  0 */
	"S (sleeping)",		/*  1 */
	"D (disk sleep)",	/*  2 */
	"T (stopped)",		/*  4 */
	"T (tracing stop)",	/*  8 */
	"Z (zombie)",		/* 16 */
	"X (dead)"		/* 32 */
};

static inline const char *get_task_state(struct task_struct *tsk)
{
	unsigned int state = (tsk->state & (TASK_RUNNING |
					    TASK_INTERRUPTIBLE |
					    TASK_UNINTERRUPTIBLE |
					    TASK_STOPPED |
					    TASK_TRACED)) |
			     (tsk->exit_state & (EXIT_ZOMBIE |
						 EXIT_DEAD));
	const char **p = &task_state_array[0];

	while (state) {
		p++;
		state >>= 1;
	}
	return *p;
}

void sched_hook_internal(struct task_struct *prev,
		struct task_struct *next)
{
	/*
	 * The recording flag and the rwlock cooperate so that the
	 * writer (hook registration/removal) always gets priority:
	 * readers only ever try-lock. Be careful.
	 */
	int cpu = task_cpu(prev);

	if (system_recording &&
	    critical_cpu_has_buf(cpu)) {
		if (read_trylock(&hook_rwlock)) {
			if (callback_func != NULL)
				callback_func(prev, next);
			read_unlock(&hook_rwlock);
		}
	}
}
EXPORT_SYMBOL(sched_hook_internal);

/*
 *  For external use
 */

int sched_sprint_symbol(char *buffer, unsigned long addr)
{
	return sprint_symbol(buffer, addr);
}
EXPORT_SYMBOL(sched_sprint_symbol);

/*
 *  Weird?
 */
unsigned long sched_get_task_prevtime(struct task_struct *prev)
{
	unsigned long delta = 0;
	int cpu = task_cpu(prev);

	if (task_switchin_time[cpu] == 0) {
		task_switchin_time[cpu] = sched_clock();
		return 0;
	}

	delta = sched_clock() - task_switchin_time[cpu];
	task_switchin_time[cpu] = sched_clock();

	return delta;
}
EXPORT_SYMBOL(sched_get_task_prevtime);

int sched_rec_task_block_info(struct task_struct *task, char *buffer)
{
	struct task_struct *lock_owner = NULL;
	int len = 0;

	if (task->pi_blocked_on != NULL &&
	    task->pi_blocked_on->lock != NULL &&
	    task->pi_blocked_on->lock->owner != NULL) {
		lock_owner = rt_mutex_owner(task->pi_blocked_on->lock);
		len += sprintf(buffer + len, "blocked by task:\n");
		len += sprintf(buffer + len, "%d (%s)  %c  %d   %lu\n",
				lock_owner->pid, lock_owner->comm,
				*get_task_state(lock_owner), task_cpu(lock_owner),
				lock_owner->rt_priority);
	}
	return len;
}
EXPORT_SYMBOL(sched_rec_task_block_info);

struct pt_regs *sched_task_pt_regs(struct task_struct *prev)
{
#ifndef task_pt_regs
/* Work-around for PPC */
#define task_pt_regs(task) (task->thread.regs)
#endif
	return (struct pt_regs *)task_pt_regs(prev);
}
EXPORT_SYMBOL(sched_task_pt_regs);

int sched_smp_processor_id(void)
{
	return smp_processor_id();
}
EXPORT_SYMBOL(sched_smp_processor_id);

void sched_switchHook_hook_regist(hook_func hook)
{
	system_recording = 0;
	write_lock(&hook_rwlock);
	callback_func = hook;
	write_unlock(&hook_rwlock);
}
EXPORT_SYMBOL(sched_switchHook_hook_regist);

void sched_switchHook_hook_delete(void)
{
	system_recording = 0;
	write_lock(&hook_rwlock);
	callback_func = NULL;
	write_unlock(&hook_rwlock);
	critical_free_buffers();
}
EXPORT_SYMBOL(sched_switchHook_hook_delete);

void sched_switchHook_hook_start(critical_cmd *cmd)
{
	critical_initial_buffer(cmd);
	system_recording = 1;
}
EXPORT_SYMBOL(sched_switchHook_hook_start);

void sched_switchHook_hook_stop(void)
{
	system_recording = 0;
}
EXPORT_SYMBOL(sched_switchHook_hook_stop);
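
To show how the two files are meant to work together, here is a hypothetical client module. It is only a sketch: linux/critical_buf.h is not shown (so the critical_cmd layout below is inferred from critical_initial_buffer()), and the call from the scheduler's context-switch path into sched_hook_internal() is a separate kernel patch that is also not shown here.

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/critical_buf.h>

static void switch_logger(struct task_struct *prev, struct task_struct *next)
{
	char line[128];
	int cpu = sched_smp_processor_id();
	int len;

	len = snprintf(line, sizeof(line), "%llu: %d(%s) -> %d(%s)\n",
			(unsigned long long)sched_get_task_prevtime(prev),
			prev->pid, prev->comm, next->pid, next->comm);
	/* force = 1: wrap around when this cpu's buffer is full */
	critical_write_buf_cpu(cpu, line, len, 1);
}

static int __init switch_logger_init(void)
{
	critical_cmd cmd = {
		.cpu_critical_map = cpu_online_map,	/* record on all online cpus */
		.buf_size	  = 1 << 20,		/* 1 MiB per cpu */
	};

	sched_switchHook_hook_regist(switch_logger);
	sched_switchHook_hook_start(&cmd);
	return 0;
}

static void __exit switch_logger_exit(void)
{
	sched_switchHook_hook_stop();
	sched_switchHook_hook_delete();	/* also frees the per-cpu buffers */
}

module_init(switch_logger_init);
module_exit(switch_logger_exit);
MODULE_LICENSE("GPL");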