
Linux 1.0 memory.c Study Notes

2013-07-03 20:36
/*

* linux/mm/memory.c

*

* Copyright (C) 1991, 1992 Linus Torvalds

*/

/*

* demand-loading started 01.12.91 - seems it is high on the list of

* things wanted, and it should be easy to implement. - Linus

*/

/*

* Ok, demand-loading was easy, shared pages a little bit tricker. Shared

* pages started 02.12.91, seems to work. - Linus.

*

* Tested sharing by executing about 30 /bin/sh: under the old kernel it

* would have taken more than the 6M I have free, but it worked well as

* far as I could see.

*

* Also corrected some "invalidate()"s - I wasn't doing enough of them.

*/

/*

* Real VM (paging to/from disk) started 18.12.91. Much more work and

* thought has to go into this. Oh, well..

* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.

* Found it. Everything seems to work now.

* 20.12.91 - Ok, making the swap-device changeable like the root.

*/

#include <asm/system.h>

#include <linux/config.h>

#include <linux/signal.h>

#include <linux/sched.h>

#include <linux/head.h>

#include <linux/kernel.h>

#include <linux/errno.h>

#include <linux/string.h>

#include <linux/types.h>

#include <linux/ptrace.h>

#include <linux/mman.h>

unsigned long high_memory = 0;

extern unsigned long pg0[1024]; /* page table for 0-4MB for everybody */

extern void sound_mem_init(void);

extern void die_if_kernel(char *,struct pt_regs *,long);

int nr_swap_pages = 0;

int nr_free_pages = 0;

unsigned long free_page_list = 0;

/*

* The secondary free_page_list is used for malloc() etc things that

* may need pages during interrupts etc. Normal get_free_page() operations

* don't touch it, so it stays as a kind of "panic-list", that can be

* accessed when all other mm tricks have failed.

*/

int nr_secondary_pages = 0;

unsigned long secondary_page_list = 0;

#define copy_page(from,to) \

__asm__("cld ; rep ; movsl": :"S" (from),"D" (to),"c" (1024):"cx","di","si")

unsigned short * mem_map = NULL;

#define CODE_SPACE(addr,p) ((addr) < (p)->end_code)

/*

* oom() prints a message (so that the user knows why the process died),

* and gives the process an untrappable SIGSEGV.

*/

void oom(struct task_struct * task)

{

printk("\nout of memory\n");

task->sigaction[SIGKILL-1].sa_handler = NULL;

task->blocked &= ~(1<<(SIGKILL-1));

send_sig(SIGKILL,task,1);

}

static void free_one_table(unsigned long * page_dir)

{

int j;

unsigned long pg_table = *page_dir;

unsigned long * page_table;

if (!pg_table)

return;

*page_dir = 0;

if (pg_table >= high_memory || !(pg_table & PAGE_PRESENT)) {

printk("Bad page table: [%p]=%08lx\n",page_dir,pg_table);

return;

}

if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED)

return;

page_table = (unsigned long *) (pg_table & PAGE_MASK);

for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++) {

unsigned long pg = *page_table;

if (!pg)

continue;

*page_table = 0;

if (pg & PAGE_PRESENT)

free_page(PAGE_MASK & pg);

else

swap_free(pg);

}

free_page(PAGE_MASK & pg_table);

}

/*

* This function clears all user-level page tables of a process - this

* is needed by execve(), so that old pages aren't in the way. Note that

* unlike 'free_page_tables()', this function still leaves a valid

* page-table-tree in memory: it just removes the user pages. The two

* functions are similar, but there is a fundamental difference.

*/

void clear_page_tables(struct task_struct * tsk)

{

int i;

unsigned long pg_dir;

unsigned long * page_dir;

if (!tsk)

return;

if (tsk == task[0])

panic("task[0] (swapper) doesn't support exec()\n");

pg_dir = tsk->tss.cr3;

page_dir = (unsigned long *) pg_dir;

if (!page_dir || page_dir == swapper_pg_dir) {

printk("Trying to clear kernel page-directory: not good\n");

return;

}

if (mem_map[MAP_NR(pg_dir)] > 1) {

unsigned long * new_pg;

if (!(new_pg = (unsigned long*) get_free_page(GFP_KERNEL))) {

oom(tsk);

return;

}

for (i = 768 ; i < 1024 ; i++)

new_pg[i] = page_dir[i];

free_page(pg_dir);

tsk->tss.cr3 = (unsigned long) new_pg;

return;

}

for (i = 0 ; i < 768 ; i++,page_dir++)

free_one_table(page_dir);

invalidate();

return;

}

/*

* This function frees up all page tables of a process when it exits.

*/

void free_page_tables(struct task_struct * tsk)

{

int i;

unsigned long pg_dir;

unsigned long * page_dir;

if (!tsk)

return;

if (tsk == task[0]) {

printk("task[0] (swapper) killed: unable to recover\n");

panic("Trying to free up swapper memory space");

}

pg_dir = tsk->tss.cr3;

if (!pg_dir || pg_dir == (unsigned long) swapper_pg_dir) {

printk("Trying to free kernel page-directory: not good\n");

return;

}

tsk->tss.cr3 = (unsigned long) swapper_pg_dir;

if (tsk == current)

__asm__ __volatile__("movl %0,%%cr3": :"a" (tsk->tss.cr3));

if (mem_map[MAP_NR(pg_dir)] > 1) {

free_page(pg_dir);

return;

}

page_dir = (unsigned long *) pg_dir;

for (i = 0 ; i < PTRS_PER_PAGE ; i++,page_dir++)

free_one_table(page_dir);

free_page(pg_dir);

invalidate();

}

/*

* clone_page_tables() clones the page table for a process - both

* processes will have the exact same pages in memory. There are

* probably races in the memory management with cloning, but we'll

* see..

*/

int clone_page_tables(struct task_struct * tsk)

{

unsigned long pg_dir;

pg_dir = current->tss.cr3;

mem_map[MAP_NR(pg_dir)]++;

tsk->tss.cr3 = pg_dir;

return 0;

}

/*

* copy_page_tables() just copies the whole process memory range:

* note the special handling of RESERVED (ie kernel) pages, which

* means that they are always shared by all processes.

*/

int copy_page_tables(struct task_struct * tsk)

{

int i;

unsigned long old_pg_dir, *old_page_dir;

unsigned long new_pg_dir, *new_page_dir;

if (!(new_pg_dir = get_free_page(GFP_KERNEL)))

return -ENOMEM;

old_pg_dir = current->tss.cr3;

tsk->tss.cr3 = new_pg_dir;

old_page_dir = (unsigned long *) old_pg_dir;

new_page_dir = (unsigned long *) new_pg_dir;

for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) {

int j;

unsigned long old_pg_table, *old_page_table;

unsigned long new_pg_table, *new_page_table;

old_pg_table = *old_page_dir;

if (!old_pg_table)

continue;

if (old_pg_table >= high_memory || !(old_pg_table & PAGE_PRESENT)) {

printk("copy_page_tables: bad page table: "

"probable memory corruption");

*old_page_dir = 0;

continue;

}

if (mem_map[MAP_NR(old_pg_table)] & MAP_PAGE_RESERVED) {

*new_page_dir = old_pg_table;

continue;

}

if (!(new_pg_table = get_free_page(GFP_KERNEL))) {

free_page_tables(tsk);

return -ENOMEM;

}

old_page_table = (unsigned long *) (PAGE_MASK & old_pg_table);

new_page_table = (unsigned long *) (PAGE_MASK & new_pg_table);

for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) {

unsigned long pg;

pg = *old_page_table;

if (!pg)

continue;

if (!(pg & PAGE_PRESENT)) {

*new_page_table = swap_duplicate(pg);

continue;

}

if ((pg & (PAGE_RW | PAGE_COW)) == (PAGE_RW | PAGE_COW))

pg &= ~PAGE_RW;

*new_page_table = pg;

if (mem_map[MAP_NR(pg)] & MAP_PAGE_RESERVED)

continue;

*old_page_table = pg;

mem_map[MAP_NR(pg)]++;

}

*new_page_dir = new_pg_table | PAGE_TABLE;

}

invalidate();

return 0;

}

/*

* a more complete version of free_page_tables which performs with page

* granularity.

*/

int unmap_page_range(unsigned long from, unsigned long size)

{

unsigned long page, page_dir;

unsigned long *page_table, *dir;

unsigned long poff, pcnt, pc;

if (from & ~PAGE_MASK) {

printk("unmap_page_range called with wrong alignment\n");

return -EINVAL;

}

size = (size + ~PAGE_MASK) >> PAGE_SHIFT;

dir = PAGE_DIR_OFFSET(current->tss.cr3,from);

poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);

if ((pcnt = PTRS_PER_PAGE - poff) > size)

pcnt = size;

for ( ; size > 0; ++dir, size -= pcnt,

pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size)) {

if (!(page_dir = *dir)) {

poff = 0;

continue;

}

if (!(page_dir & PAGE_PRESENT)) {

printk("unmap_page_range: bad page directory.");

continue;

}

page_table = (unsigned long *)(PAGE_MASK & page_dir);

if (poff) {

page_table += poff;

poff = 0;

}

for (pc = pcnt; pc--; page_table++) {

if ((page = *page_table) != 0) {

*page_table = 0;

if (1 & page) {

if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))

if (current->rss > 0)

--current->rss;

free_page(PAGE_MASK & page);

} else

swap_free(page);

}

}

if (pcnt == PTRS_PER_PAGE) {

*dir = 0;

free_page(PAGE_MASK & page_dir);

}

}

invalidate();

return 0;

}

int zeromap_page_range(unsigned long from, unsigned long size, int mask)

{

unsigned long *page_table, *dir;

unsigned long poff, pcnt;

unsigned long page;

if (mask) {

if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) {

printk("zeromap_page_range: mask = %08x\n",mask);

return -EINVAL;

}

mask |= ZERO_PAGE;

}

if (from & ~PAGE_MASK) {

printk("zeromap_page_range: from = %08lx\n",from);

return -EINVAL;

}

dir = PAGE_DIR_OFFSET(current->tss.cr3,from);

size = (size + ~PAGE_MASK) >> PAGE_SHIFT;

poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);

if ((pcnt = PTRS_PER_PAGE - poff) > size)

pcnt = size;

while (size > 0) {

if (!(PAGE_PRESENT & *dir)) {

/* clear page needed here? SRB. */

if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) {

invalidate();

return -ENOMEM;

}

if (PAGE_PRESENT & *dir) {

free_page((unsigned long) page_table);

page_table = (unsigned long *)(PAGE_MASK & *dir++);

} else

*dir++ = ((unsigned long) page_table) | PAGE_TABLE;

} else

page_table = (unsigned long *)(PAGE_MASK & *dir++);

page_table += poff;

poff = 0;

for (size -= pcnt; pcnt-- ;) {

if ((page = *page_table) != 0) {

*page_table = 0;

if (page & PAGE_PRESENT) {

if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))

if (current->rss > 0)

--current->rss;

free_page(PAGE_MASK & page);

} else

swap_free(page);

}

*page_table++ = mask;

}

pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);

}

invalidate();

return 0;

}

/*

* maps a range of physical memory into the requested pages. the old

* mappings are removed. any references to nonexistent pages results

* in null mappings (currently treated as "copy-on-access")

*/

int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int mask)

{

unsigned long *page_table, *dir;

unsigned long poff, pcnt;

unsigned long page;

if (mask) {

if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) {

printk("remap_page_range: mask = %08x\n",mask);

return -EINVAL;

}

}

if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) {

printk("remap_page_range: from = %08lx, to=%08lx\n",from,to);

return -EINVAL;

}

dir = PAGE_DIR_OFFSET(current->tss.cr3,from);

size = (size + ~PAGE_MASK) >> PAGE_SHIFT;

poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);

if ((pcnt = PTRS_PER_PAGE - poff) > size)

pcnt = size;

while (size > 0) {

if (!(PAGE_PRESENT & *dir)) {

/* clearing page here, needed? SRB. */

if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) {

invalidate();

return -1;

}

*dir++ = ((unsigned long) page_table) | PAGE_TABLE;

}

else

page_table = (unsigned long *)(PAGE_MASK & *dir++);

if (poff) {

page_table += poff;

poff = 0;

}

for (size -= pcnt; pcnt-- ;) {

if ((page = *page_table) != 0) {

*page_table = 0;

if (PAGE_PRESENT & page) {

if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))

if (current->rss > 0)

--current->rss;

free_page(PAGE_MASK & page);

} else

swap_free(page);

}

/*

* the first condition should return an invalid access

* when the page is referenced. current assumptions

* cause it to be treated as demand allocation in some

* cases.

*/

if (!mask)

*page_table++ = 0; /* not present */

else if (to >= high_memory)

*page_table++ = (to | mask);

else if (!mem_map[MAP_NR(to)])

*page_table++ = 0; /* not present */

else {

*page_table++ = (to | mask);

if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) {

++current->rss;

mem_map[MAP_NR(to)]++;

}

}

to += PAGE_SIZE;

}

pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);

}

invalidate();

return 0;

}

/*

* This function puts a page in memory at the wanted address.

* It returns the physical address of the page gotten, 0 if

* out of memory (either when trying to access page-table or

* page.)

*/

unsigned long put_page(struct task_struct * tsk,unsigned long page,

unsigned long address,int prot)

{

unsigned long *page_table;

if ((prot & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT)

printk("put_page: prot = %08x\n",prot);

if (page >= high_memory) {

printk("put_page: trying to put page %08lx at %08lx\n",page,address);

return 0;

}

page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);

if ((*page_table) & PAGE_PRESENT)

page_table = (unsigned long *) (PAGE_MASK & *page_table);

else {

printk("put_page: bad page directory entry\n");

oom(tsk);

*page_table = BAD_PAGETABLE | PAGE_TABLE;

return 0;

}

page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);

if (*page_table) {

printk("put_page: page already exists\n");

*page_table = 0;

invalidate();

}

*page_table = page | prot;

/* no need for invalidate */

return page;

}

/*

* The previous function doesn't work very well if you also want to mark

* the page dirty: exec.c wants this, as it has earlier changed the page,

* and we want the dirty-status to be correct (for VM). Thus the same

* routine, but this time we mark it dirty too.

*/

unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)

{

unsigned long tmp, *page_table;

if (page >= high_memory)

printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);

if (mem_map[MAP_NR(page)] != 1)

printk("mem_map disagrees with %08lx at %08lx\n",page,address);

page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);

if (PAGE_PRESENT & *page_table)

page_table = (unsigned long *) (PAGE_MASK & *page_table);

else {

if (!(tmp = get_free_page(GFP_KERNEL)))

return 0;

if (PAGE_PRESENT & *page_table) {

free_page(tmp);

page_table = (unsigned long *) (PAGE_MASK & *page_table);

} else {

*page_table = tmp | PAGE_TABLE;

page_table = (unsigned long *) tmp;

}

}

page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);

if (*page_table) {

printk("put_dirty_page: page already exists\n");

*page_table = 0;

invalidate();

}

*page_table = page | (PAGE_DIRTY | PAGE_PRIVATE);

/* no need for invalidate */

return page;

}

/*

* This routine handles present pages, when users try to write

* to a shared page. It is done by copying the page to a new address

* and decrementing the shared-page counter for the old page.

*

* Note that we do many checks twice (look at do_wp_page()), as

* we have to be careful about race-conditions.

*

* Goto-purists beware: the only reason for goto's here is that it results

* in better assembly code.. The "default" path will see no jumps at all.

*/

static void __do_wp_page(unsigned long error_code, unsigned long address,

struct task_struct * tsk, unsigned long user_esp)

{

unsigned long *pde, pte, old_page, prot;

unsigned long new_page;

new_page = __get_free_page(GFP_KERNEL);

pde = PAGE_DIR_OFFSET(tsk->tss.cr3,address);

pte = *pde;

if (!(pte & PAGE_PRESENT))

goto end_wp_page;

if ((pte & PAGE_TABLE) != PAGE_TABLE || pte >= high_memory)

goto bad_wp_pagetable;

pte &= PAGE_MASK;

pte += PAGE_PTR(address);

old_page = *(unsigned long *) pte;

if (!(old_page & PAGE_PRESENT))

goto end_wp_page;

if (old_page >= high_memory)

goto bad_wp_page;

if (old_page & PAGE_RW)

goto end_wp_page;

tsk->min_flt++;

prot = (old_page & ~PAGE_MASK) | PAGE_RW;

old_page &= PAGE_MASK;

if (mem_map[MAP_NR(old_page)] != 1) {

if (new_page) {

if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)

++tsk->rss;

copy_page(old_page,new_page);

*(unsigned long *) pte = new_page | prot;

free_page(old_page);

invalidate();

return;

}

free_page(old_page);

oom(tsk);

*(unsigned long *) pte = BAD_PAGE | prot;

invalidate();

return;

}

*(unsigned long *) pte |= PAGE_RW;

invalidate();

if (new_page)

free_page(new_page);

return;

bad_wp_page:

printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);

*(unsigned long *) pte = BAD_PAGE | PAGE_SHARED;

send_sig(SIGKILL, tsk, 1);

goto end_wp_page;

bad_wp_pagetable:

printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n",address,pte);

*pde = BAD_PAGETABLE | PAGE_TABLE;

send_sig(SIGKILL, tsk, 1);

end_wp_page:

if (new_page)

free_page(new_page);

return;

}

/*

* check that a page table change is actually needed, and call

* the low-level function only in that case..

*/

void do_wp_page(unsigned long error_code, unsigned long address,

struct task_struct * tsk, unsigned long user_esp)

{

unsigned long page;

unsigned long * pg_table;

pg_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);

page = *pg_table;

if (!page)

return;

if ((page & PAGE_PRESENT) && page < high_memory) {

pg_table = (unsigned long *) ((page & PAGE_MASK) + PAGE_PTR(address));

page = *pg_table;

if (!(page & PAGE_PRESENT))

return;

if (page & PAGE_RW)

return;

if (!(page & PAGE_COW)) {

if (user_esp && tsk == current) {

current->tss.cr2 = address;

current->tss.error_code = error_code;

current->tss.trap_no = 14;

send_sig(SIGSEGV, tsk, 1);

return;

}

}

if (mem_map[MAP_NR(page)] == 1) {

*pg_table |= PAGE_RW | PAGE_DIRTY;

invalidate();

return;

}

__do_wp_page(error_code, address, tsk, user_esp);

return;

}

printk("bad page directory entry %08lx\n",page);

*pg_table = 0;

}

int __verify_write(unsigned long start, unsigned long size)

{

size--;

size += start & ~PAGE_MASK;

size >>= PAGE_SHIFT;

start &= PAGE_MASK;

do {

do_wp_page(1,start,current,0);

start += PAGE_SIZE;

} while (size--);

return 0;

}

static inline void get_empty_page(struct task_struct * tsk, unsigned long address)

{

unsigned long tmp;

if (!(tmp = get_free_page(GFP_KERNEL))) {

oom(tsk);

tmp = BAD_PAGE;

}

if (!put_page(tsk,tmp,address,PAGE_PRIVATE))

free_page(tmp);

}

/*

* try_to_share() checks the page at address "address" in the task "p",

* to see if it exists, and if it is clean. If so, share it with the current

* task.

*

* NOTE! This assumes we have checked that p != current, and that they

* share the same executable or library.

*

* We may want to fix this to allow page sharing for PIC pages at different

* addresses so that ELF will really perform properly. As long as the vast

* majority of sharable libraries load at fixed addresses this is not a

* big concern. Any sharing of pages between the buffer cache and the

* code space reduces the need for this as well. - ERY

*/

static int try_to_share(unsigned long address, struct task_struct * tsk,

struct task_struct * p, unsigned long error_code, unsigned long newpage)

{

unsigned long from;

unsigned long to;

unsigned long from_page;

unsigned long to_page;

from_page = (unsigned long)PAGE_DIR_OFFSET(p->tss.cr3,address);

to_page = (unsigned long)PAGE_DIR_OFFSET(tsk->tss.cr3,address);

/* is there a page-directory at from? */

from = *(unsigned long *) from_page;

if (!(from & PAGE_PRESENT))

return 0;

from &= PAGE_MASK;

from_page = from + PAGE_PTR(address);

from = *(unsigned long *) from_page;

/* is the page clean and present? */

if ((from & (PAGE_PRESENT | PAGE_DIRTY)) != PAGE_PRESENT)

return 0;

if (from >= high_memory)

return 0;

if (mem_map[MAP_NR(from)] & MAP_PAGE_RESERVED)

return 0;

/* is the destination ok? */

to = *(unsigned long *) to_page;

if (!(to & PAGE_PRESENT))

return 0;

to &= PAGE_MASK;

to_page = to + PAGE_PTR(address);

if (*(unsigned long *) to_page)

return 0;

/* share them if read - do COW immediately otherwise */

if (error_code & PAGE_RW) {

if(!newpage) /* did the page exist? SRB. */

return 0;

copy_page((from & PAGE_MASK),newpage);

to = newpage | PAGE_PRIVATE;

} else {

mem_map[MAP_NR(from)]++;

from &= ~PAGE_RW;

to = from;

if(newpage) /* only if it existed. SRB. */

free_page(newpage);

}

*(unsigned long *) from_page = from;

*(unsigned long *) to_page = to;

invalidate();

return 1;

}

/*

* share_page() tries to find a process that could share a page with

* the current one. Address is the address of the wanted page relative

* to the current data space.

*

* We first check if it is at all feasible by checking executable->i_count.

* It should be >1 if there are other tasks sharing this inode.

*/

int share_page(struct vm_area_struct * area, struct task_struct * tsk,

struct inode * inode,

unsigned long address, unsigned long error_code, unsigned long newpage)

{

struct task_struct ** p;

if (!inode || inode->i_count < 2 || !area->vm_ops)

return 0;

for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) {

if (!*p)

continue;

if (tsk == *p)

continue;

if (inode != (*p)->executable) {

if(!area) continue;

/* Now see if there is something in the VMM that

we can share pages with */

if(area){

struct vm_area_struct * mpnt;

for (mpnt = (*p)->mmap; mpnt; mpnt = mpnt->vm_next) {

if (mpnt->vm_ops == area->vm_ops &&

mpnt->vm_inode->i_ino == area->vm_inode->i_ino&&

mpnt->vm_inode->i_dev == area->vm_inode->i_dev){

if (mpnt->vm_ops->share(mpnt, area, address))

break;

};

};

if (!mpnt) continue; /* Nope. Nuthin here */

};

}

if (try_to_share(address,tsk,*p,error_code,newpage))

return 1;

}

return 0;

}

/*

* fill in an empty page-table if none exists.

*/

static inline unsigned long get_empty_pgtable(struct task_struct * tsk,unsigned long address)

{

unsigned long page;

unsigned long *p;

p = PAGE_DIR_OFFSET(tsk->tss.cr3,address);

if (PAGE_PRESENT & *p)

return *p;

if (*p) {

printk("get_empty_pgtable: bad page-directory entry \n");

*p = 0;

}

page = get_free_page(GFP_KERNEL);

p = PAGE_DIR_OFFSET(tsk->tss.cr3,address);

if (PAGE_PRESENT & *p) {

free_page(page);

return *p;

}

if (*p) {

printk("get_empty_pgtable: bad page-directory entry \n");

*p = 0;

}

if (page) {

*p = page | PAGE_TABLE;

return *p;

}

oom(current);

*p = BAD_PAGETABLE | PAGE_TABLE;

return 0;

}

void do_no_page(unsigned long error_code, unsigned long address,

struct task_struct *tsk, unsigned long user_esp)

{

unsigned long tmp;

unsigned long page;

struct vm_area_struct * mpnt;

page = get_empty_pgtable(tsk,address);

if (!page)

return;

page &= PAGE_MASK;

page += PAGE_PTR(address);

tmp = *(unsigned long *) page;

if (tmp & PAGE_PRESENT)

return;

++tsk->rss;

if (tmp) {

++tsk->maj_flt;

swap_in((unsigned long *) page);

return;

}

address &= 0xfffff000;

tmp = 0;

for (mpnt = tsk->mmap; mpnt != NULL; mpnt = mpnt->vm_next) {

if (address < mpnt->vm_start)

break;

if (address >= mpnt->vm_end) {

tmp = mpnt->vm_end;

continue;

}

if (!mpnt->vm_ops || !mpnt->vm_ops->nopage) {

++tsk->min_flt;

get_empty_page(tsk,address);

return;

}

mpnt->vm_ops->nopage(error_code, mpnt, address);

return;

}

if (tsk != current)

goto ok_no_page;

if (address >= tsk->end_data && address < tsk->brk)

goto ok_no_page;

if (mpnt && mpnt == tsk->stk_vma &&

address - tmp > mpnt->vm_start - address &&

tsk->rlim[RLIMIT_STACK].rlim_cur > mpnt->vm_end - address) {

mpnt->vm_start = address;

goto ok_no_page;

}

tsk->tss.cr2 = address;

current->tss.error_code = error_code;

current->tss.trap_no = 14;

send_sig(SIGSEGV,tsk,1);

if (error_code & 4) /* user level access? */

return;

ok_no_page:

++tsk->min_flt;

get_empty_page(tsk,address);

}

/*

* This routine handles page faults. It determines the address,

* and the problem, and then passes it off to one of the appropriate

* routines.

*/

asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)

{

unsigned long address;

unsigned long user_esp = 0;

unsigned int bit;

/* get the address */

__asm__("movl %%cr2,%0":"=r" (address));

if (address < TASK_SIZE) {

if (error_code & 4) { /* user mode access? */

if (regs->eflags & VM_MASK) {

bit = (address - 0xA0000) >> PAGE_SHIFT;

if (bit < 32)

current->screen_bitmap |= 1 << bit;

} else

user_esp = regs->esp;

}

if (error_code & 1)

do_wp_page(error_code, address, current, user_esp);

else

do_no_page(error_code, address, current, user_esp);

return;

}

address -= TASK_SIZE;

if (wp_works_ok < 0 && address == 0 && (error_code & PAGE_PRESENT)) {

wp_works_ok = 1;

pg0[0] = PAGE_SHARED;

printk("This processor honours the WP bit even when in supervisor mode. Good.\n");

return;

}

if (address < PAGE_SIZE) {

printk("Unable to handle kernel NULL pointer dereference");

pg0[0] = PAGE_SHARED;

} else

printk("Unable to handle kernel paging request");

printk(" at address %08lx\n",address);

die_if_kernel("Oops", regs, error_code);

do_exit(SIGKILL);

}

/*

* BAD_PAGE is the page that is used for page faults when linux

* is out-of-memory. Older versions of linux just did a

* do_exit(), but using this instead means there is less risk

* for a process dying in kernel mode, possibly leaving a inode

* unused etc..

*

* BAD_PAGETABLE is the accompanying page-table: it is initialized

* to point to BAD_PAGE entries.

*

* ZERO_PAGE is a special page that is used for zero-initialized

* data and COW.

*/

unsigned long __bad_pagetable(void)

{

extern char empty_bad_page_table[PAGE_SIZE];

__asm__ __volatile__("cld ; rep ; stosl":

:"a" (BAD_PAGE + PAGE_TABLE),

"D" ((long) empty_bad_page_table),

"c" (PTRS_PER_PAGE)

:"di","cx");

return (unsigned long) empty_bad_page_table;

}

unsigned long __bad_page(void)

{

extern char empty_bad_page[PAGE_SIZE];

__asm__ __volatile__("cld ; rep ; stosl":

:"a" (0),

"D" ((long) empty_bad_page),

"c" (PTRS_PER_PAGE)

:"di","cx");

return (unsigned long) empty_bad_page;

}

unsigned long __zero_page(void)

{

extern char empty_zero_page[PAGE_SIZE];

__asm__ __volatile__("cld ; rep ; stosl":

:"a" (0),

"D" ((long) empty_zero_page),

"c" (PTRS_PER_PAGE)

:"di","cx");

return (unsigned long) empty_zero_page;

}

void show_mem(void)

{

int i,free = 0,total = 0,reserved = 0;

int shared = 0;

printk("Mem-info:\n");

printk("Free pages: %6dkB\n",nr_free_pages<<(PAGE_SHIFT-10));

printk("Secondary pages: %6dkB\n",nr_secondary_pages<<(PAGE_SHIFT-10));

printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));

i = high_memory >> PAGE_SHIFT;

while (i-- > 0) {

total++;

if (mem_map[i] & MAP_PAGE_RESERVED)

reserved++;

else if (!mem_map[i])

free++;

else

shared += mem_map[i]-1;

}

printk("%d pages of RAM\n",total);

printk("%d free pages\n",free);

printk("%d reserved pages\n",reserved);

printk("%d pages shared\n",shared);

show_buffers();

}

/*

* paging_init() sets up the page tables - note that the first 4MB are

* already mapped by head.S.

*

* This routines also unmaps the page at virtual kernel address 0, so

* that we can trap those pesky NULL-reference errors in the kernel.

*/

// The 1GB of virtual address space from 0xC0000000 to 0xFFFFFFFF is kernel space, so even with plenty of physical memory, at most 1GB of it can be mapped there.

// The kernel's virtual addresses starting at 0xC0000000 (i.e. 3GB) map linearly onto physical addresses starting at 0.

// In this version of the kernel, physical memory is assumed to be at most 16MB, so only the virtual range 0xC0000000 to 0xC0000000+16MB is mapped, onto physical 0 to 16MB.

// In other words, the 16MB of kernel virtual addresses starting at 0xC0000000 map contiguously onto physical addresses 0-16MB.

// Hence a kernel physical address plus 0xC0000000 gives the corresponding kernel virtual address, and a kernel virtual address minus 0xC0000000 gives the physical address.
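/*
 * Illustration added for these notes, not part of the original memory.c:
 * a minimal sketch of the conversion described above, assuming the kernel
 * mapping really is a flat +0xC0000000 offset as in this version.
 * KERNEL_OFFSET, kvirt_to_phys and phys_to_kvirt are made-up names used
 * only in this sketch.
 */
#define KERNEL_OFFSET 0xC0000000UL

static inline unsigned long kvirt_to_phys(unsigned long vaddr)
{
	return vaddr - KERNEL_OFFSET; /* kernel virtual -> physical */
}

static inline unsigned long phys_to_kvirt(unsigned long paddr)
{
	return paddr + KERNEL_OFFSET; /* physical -> kernel virtual */
}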

unsigned long paging_init(unsigned long start_mem, unsigned long end_mem)

{

unsigned long * pg_dir;

unsigned long * pg_table;

unsigned long tmp;

unsigned long address;

/*

* Physical page 0 is special; it's not touched by Linux since BIOS

* and SMM (for laptops with [34]86/SL chips) may need it. It is read

* and write protected to detect null pointer references in the

* kernel.

*/

#if 0

memset((void *) 0, 0, PAGE_SIZE);

#endif

start_mem = PAGE_ALIGN(start_mem); // start_mem is the first free physical address after the end of the kernel image

address = 0; // physical addresses are mapped starting from 0

pg_dir = swapper_pg_dir; // swapper_pg_dir is the page directory built in head.S; its entries 0 and 768 both point to the page table pg0

while (address < end_mem) {

tmp = *(pg_dir + 768); /* at virtual addr 0xC0000000 */

if (!tmp) {

tmp = start_mem | PAGE_TABLE;

*(pg_dir + 768) = tmp; // allocate the page table starting at the first free physical page after the kernel image

start_mem += PAGE_SIZE; // each page table occupies one 4096-byte page

}

// In the page directory, entries 0 and 768 both point to pg0 (covering physical 0-4MB); entries 1 and 769 both point to the newly allocated page table (covering physical 4-8MB); and so on, until the full 16MB have been mapped.

*pg_dir = tmp; /* also map it in at 0x0000000 for init */

pg_dir++;

pg_table = (unsigned long *) (tmp & PAGE_MASK); // tmp is the page-table address ORed with its attribute bits, so masking with PAGE_MASK recovers the bare page-table address

// PTRS_PER_PAGE is 1024, so each of the 1024 entries of the new page table pg_table is filled with a physical page address plus its attribute bits.

// Entry 768 points to pg0, whose 0-4MB of physical memory was already mapped in head.S; it is mapped again here, harmlessly, to the same 0-4MB. What paging_init really adds is the mapping of physical 4-16MB (see the address-splitting sketch after paging_init below).

for (tmp = 0 ; tmp < PTRS_PER_PAGE ; tmp++,pg_table++) {

if (address < end_mem) // while there is still physical memory left to map

*pg_table = address | PAGE_SHARED;

else

*pg_table = 0;

address += PAGE_SIZE; // each entry mapped advances the physical address by 4096

}

}

invalidate();

return start_mem;

}
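/*
 * Illustration added for these notes, not part of the original memory.c:
 * how a 32-bit linear address is split by the two-level page tables that
 * paging_init() sets up (PAGE_SHIFT is 12 here). pde_index and pte_index
 * are made-up helper names used only in this sketch.
 */
static inline unsigned long pde_index(unsigned long address)
{
	return address >> 22; /* bits 31..22 select the page-directory entry */
}

static inline unsigned long pte_index(unsigned long address)
{
	return (address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1); /* bits 21..12 select the page-table entry */
}
/* bits 11..0 are the offset within the 4096-byte page */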

// mem_init() reports how much memory there is in total and marks which parts of it are free and usable.

// start_low_mem is the start of usable memory below 1MB (i.e. memory before the kernel image); start_mem is the start of usable memory after the kernel image;

// end_mem is the highest address of usable memory after the kernel image.

void mem_init(unsigned long start_low_mem,

unsigned long start_mem, unsigned long end_mem)

{

int codepages = 0;

int reservedpages = 0;

int datapages = 0;

unsigned long tmp;

unsigned short * p;

extern int etext;

cli();

end_mem &= PAGE_MASK;

high_memory = end_mem;

start_mem += 0x0000000f;

start_mem &= ~0x0000000f;

tmp = MAP_NR(end_mem); // tmp is the number of physical pages, derived from the highest memory address (see the MAP_NR sketch after mem_init below)

mem_map = (unsigned short *) start_mem; // the mem_map array starts at the current start of free memory, i.e. the first part of free memory is given over to mem_map

p = mem_map + tmp; // the tmp*2 bytes starting at start_mem (one unsigned short per page) become the mem_map array

start_mem = (unsigned long) p; // the new start_mem is the old start_mem plus tmp*2

while (p > mem_map) // initialize all tmp entries of mem_map to MAP_PAGE_RESERVED

*--p = MAP_PAGE_RESERVED;

start_low_mem = PAGE_ALIGN(start_low_mem);

start_mem = PAGE_ALIGN(start_mem);

while (start_low_mem < 0xA0000) { // memory from 0x1000 up to 0xA0000 is usable

mem_map[MAP_NR(start_low_mem)] = 0; // 0x1000/PAGE_SIZE gives index 1 in mem_map, i.e. the page covering physical 4-8KB; mem_map[0] corresponds to 0-4KB

start_low_mem += PAGE_SIZE; // each page marked free advances start_low_mem by PAGE_SIZE

}

while (start_mem < end_mem) { // start_mem now points past the kernel image, the page tables and the mem_map array

mem_map[MAP_NR(start_mem)] = 0; // start_mem/PAGE_SIZE gives the physical page number of this address, which is also its index into mem_map

start_mem += PAGE_SIZE; // each page marked free advances start_mem by PAGE_SIZE

}

#ifdef CONFIG_SOUND

sound_mem_init();

#endif

free_page_list = 0;

nr_free_pages = 0;

for (tmp = 0 ; tmp < end_mem ; tmp += PAGE_SIZE) {

if (mem_map[MAP_NR(tmp)]) { // within 0-16MB, everything except the pages in 0x1000-0xA0000 and start_mem-end_mem is still MAP_PAGE_RESERVED and therefore not free

if (tmp >= 0xA0000 && tmp < 0x100000) // 0xA0000-0x100000 is video memory and the BIOS ROM area, which stays reserved

reservedpages++;

else if (tmp < (unsigned long) &etext) // etext is the end of the kernel text, so 0x100000 up to etext counts as kernel code pages

codepages++;

else // everything else counts as data pages, e.g. 0x0-0x1000 (interrupt vector table and BIOS data area) and, after the kernel image, the page tables and the mem_map array

datapages++;

continue;

}

*(unsigned long *) tmp = free_page_list;

free_page_list = tmp;

nr_free_pages++;

}

tmp = nr_free_pages << PAGE_SHIFT;

printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data)\n",

tmp >> 10,

end_mem >> 10,

codepages << (PAGE_SHIFT-10),

reservedpages << (PAGE_SHIFT-10),

datapages << (PAGE_SHIFT-10));

/* test if the WP bit is honoured in supervisor mode */

wp_works_ok = -1;

pg0[0] = PAGE_READONLY;

invalidate();

__asm__ __volatile__("movb 0,%%al ; movb %%al,0": : :"ax", "memory");

pg0[0] = 0;

invalidate();

if (wp_works_ok < 0)

wp_works_ok = 0;

return;

}
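/*
 * Illustration added for these notes, not part of the original memory.c:
 * mem_map keeps one unsigned short use count per physical page, indexed by
 * MAP_NR(addr), i.e. addr >> PAGE_SHIFT, and mem_init() threads the free
 * pages into a singly linked list by storing the previous list head in the
 * first word of each free page. count_free_list() is a hypothetical walk
 * over that list; its result should equal nr_free_pages.
 */
static unsigned long count_free_list(void)
{
	unsigned long page, n = 0;

	for (page = free_page_list; page; page = *(unsigned long *) page)
		n++; /* each free page stores the address of the next free page */
	return n;
}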

void si_meminfo(struct sysinfo *val)

{

int i;

i = high_memory >> PAGE_SHIFT;

val->totalram = 0;

val->freeram = 0;

val->sharedram = 0;

val->bufferram = buffermem;

while (i-- > 0) {

if (mem_map[i] & MAP_PAGE_RESERVED)

continue;

val->totalram++;

if (!mem_map[i]) {

val->freeram++;

continue;

}

val->sharedram += mem_map[i]-1;

}

val->totalram <<= PAGE_SHIFT;

val->freeram <<= PAGE_SHIFT;

val->sharedram <<= PAGE_SHIFT;

return;

}

/* This handles a generic mmap of a disk file */

void file_mmap_nopage(int error_code, struct vm_area_struct * area, unsigned long address)

{

struct inode * inode = area->vm_inode;

unsigned int block;

unsigned long page;

int nr[8];

int i, j;

int prot = area->vm_page_prot;

address &= PAGE_MASK;

block = address - area->vm_start + area->vm_offset;

block >>= inode->i_sb->s_blocksize_bits;

page = get_free_page(GFP_KERNEL);

if (share_page(area, area->vm_task, inode, address, error_code, page)) {

++area->vm_task->min_flt;

return;

}

++area->vm_task->maj_flt;

if (!page) {

oom(current);

put_page(area->vm_task, BAD_PAGE, address, PAGE_PRIVATE);

return;

}

for (i=0, j=0; i< PAGE_SIZE ; j++, block++, i += inode->i_sb->s_blocksize)

nr[j] = bmap(inode,block);

if (error_code & PAGE_RW)

prot |= PAGE_RW | PAGE_DIRTY;

page = bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, prot);

if (!(prot & PAGE_RW)) {

if (share_page(area, area->vm_task, inode, address, error_code, page))

return;

}

if (put_page(area->vm_task,page,address,prot))

return;

free_page(page);

oom(current);

}

void file_mmap_free(struct vm_area_struct * area)

{

if (area->vm_inode)

iput(area->vm_inode);

#if 0

if (area->vm_inode)

printk("Free inode %x:%d (%d)\n",area->vm_inode->i_dev,

area->vm_inode->i_ino, area->vm_inode->i_count);

#endif

}

/*

* Compare the contents of the mmap entries, and decide if we are allowed to

* share the pages

*/

int file_mmap_share(struct vm_area_struct * area1,

struct vm_area_struct * area2,

unsigned long address)

{

if (area1->vm_inode != area2->vm_inode)

return 0;

if (area1->vm_start != area2->vm_start)

return 0;

if (area1->vm_end != area2->vm_end)

return 0;

if (area1->vm_offset != area2->vm_offset)

return 0;

if (area1->vm_page_prot != area2->vm_page_prot)

return 0;

return 1;

}

struct vm_operations_struct file_mmap = {

NULL, /* open */

file_mmap_free, /* close */

file_mmap_nopage, /* nopage */

NULL, /* wppage */

file_mmap_share, /* share */

NULL, /* unmap */

};