2021 MIT6.S081 Copy-on-Write Fork for xv6

This is the 2021 version.

This part of the 2021 version is slightly different from the 2020 version.

COW presumably should be familiar to the students of the subject class, so this article focuses on the operation process without much analysis. For details, please refer to https://pdos.csail.mit.edu/6.828/2021/labs/cow.html

Let's start experimenting.

Switch to the COW branch

git fetch
git checkout cow
make clean

First, let's look at what the fork function is doing (the function is in kernel/proc.c):

int
fork(void)
{
  int i, pid;
  struct proc *np;
  struct proc *p = myproc();

  // Allocate process.
  if((np = allocproc()) == 0){
    return -1;
  }

  // Copy user memory from parent to child.
  if(uvmcopy(p->pagetable, np->pagetable, p->sz) < 0){
    freeproc(np);
    release(&np->lock);
    return -1;
  }
  np->sz = p->sz;

  // copy saved user registers.
  *(np->trapframe) = *(p->trapframe);

  // Cause fork to return 0 in the child.
  np->trapframe->a0 = 0;

  // increment reference counts on open file descriptors.
  for(i = 0; i < NOFILE; i++)
    if(p->ofile[i])
      np->ofile[i] = filedup(p->ofile[i]);
  np->cwd = idup(p->cwd);

  safestrcpy(np->name, p->name, sizeof(p->name));

  pid = np->pid;

  release(&np->lock);

  acquire(&wait_lock);
  np->parent = p;
  release(&wait_lock);

  acquire(&np->lock);
  np->state = RUNNABLE;
  release(&np->lock);

  return pid;
}

It can be found that the main modification of copy-on-write is the uvmcopy function, which changes the original direct copy to the reference counting mode.

Therefore, the whole experimental procedure is as follows:

First add in kernel/riscv.h

#define PTE_C (1L << 8) // COW

If you are not familiar with PTE, you can review the previous experiments.

Then add the following to the very beginning of kernel/kalloc.c:

note: Because there is no multi-threading involved here, there is no part of the test point, so there is no locking part. If it is multi-threaded, it should be locked. After all, this reference count is a shared variable.

//KERNBASE == 0x80000000
static int reference_count[(PHYSTOP - KERNBASE) / PGSIZE];

static int idx_rc(uint64 pa){ //get reference cnt
    return (pa - KERNBASE) / PGSIZE;
}
void add_rc(uint64 pa){ // add
    reference_count[idx_rc(pa)]++;
}
void sub_rc(uint64 pa){ // sub
    reference_count[idx_rc(pa)]--;
}

Then modify the uvmcopy function in kernel/vm.c.

The function turns out to be:

int
uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
{
  pte_t *pte;
  uint64 pa, i;
  uint flags;
  char *mem;

  for(i = 0; i < sz; i += PGSIZE){
    if((pte = walk(old, i, 0)) == 0)
      panic("uvmcopy: pte should exist");
    if((*pte & PTE_V) == 0)
      panic("uvmcopy: page not present");
    pa = PTE2PA(*pte);
    flags = PTE_FLAGS(*pte);
    if((mem = kalloc()) == 0)
      goto err;
    memmove(mem, (char*)pa, PGSIZE);
    if(mappages(new, i, PGSIZE, (uint64)mem, flags) != 0){
      kfree(mem);
      goto err;
    }
  }
  return 0;

 err:
  uvmunmap(new, 0, i / PGSIZE, 1);
  return -1;
}

Instead of copying the page directly, instead of increasing the reference count, change it to the following:

int
uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
{
  pte_t *pte;
  uint64 pa, i;
  uint flags;

  for(i = 0; i < sz; i += PGSIZE){
    if((pte = walk(old, i, 0)) == 0)
      panic("uvmcopy: pte should exist");
    if((*pte & PTE_V) == 0)
      panic("uvmcopy: page not present");
    pa = PTE2PA(*pte);
    *pte = (*pte & ~PTE_W) | PTE_C;
    flags = PTE_FLAGS(*pte);
    if(mappages(new, i, PGSIZE,pa, flags) != 0){
      goto err;
    }
    add_rc(pa);// Instead of allocating pages, increase the reference count
  }
  return 0;

 err:
  uvmunmap(new, 0, i / PGSIZE, 1);
  return -1;
}

Then when dereferencing, you need to subtract a reference count. We need to pay attention to this part. This reference is logically located at the physical address, so just change the do_free related part in uvmunmap, that is, modify the kfree function ( in kernel/kalloc.c)

The original code:

void
kfree(void *pa)
{
  struct run *r;

  if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP)
    panic("kfree");

  // Fill with junk to catch dangling refs.
  memset(pa, 1, PGSIZE);

  r = (struct run*)pa;

  acquire(&kmem.lock);
  r->next = kmem.freelist;
  kmem.freelist = r;
  release(&kmem.lock);
}

Change it to the following, increase it only if the reference count is processed, otherwise subtract the reference count once:

void
kfree(void *pa)
{
  struct run *r;

  if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP)
    panic("kfree");
  if(reference_count[idx_rc((uint64)pa)] > 1){// > 1 just sub 1
      sub_rc((uint64)pa);
      return;
  }
  // Fill with junk to catch dangling refs.
  memset(pa, 1, PGSIZE);

  r = (struct run*)pa;

  acquire(&kmem.lock);
  r->next = kmem.freelist;
  kmem.freelist = r;
  release(&kmem.lock);
  reference_count[idx_rc((uint64)pa)] = 0;
}

Modify the usertrap function below (in kernel/trap.c)

The original code:

void
usertrap(void)
{
  int which_dev = 0;

  if((r_sstatus() & SSTATUS_SPP) != 0)
    panic("usertrap: not from user mode");

  // send interrupts and exceptions to kerneltrap(),
  // since we're now in the kernel.
  w_stvec((uint64)kernelvec);

  struct proc *p = myproc();

  // save user program counter.
  p->trapframe->epc = r_sepc();

  if(r_scause() == 8){
    // system call

    if(p->killed)
      exit(-1);

    // sepc points to the ecall instruction,
    // but we want to return to the next instruction.
    p->trapframe->epc += 4;

    // an interrupt will change sstatus &c registers,
    // so don't enable until done with those registers.
    intr_on();

    syscall();
  } else if((which_dev = devintr()) != 0){
    // ok
  } else {
    printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
    printf("            sepc=%p stval=%p\n", r_sepc(), r_stval());
    p->killed = 1;
  }

  if(p->killed)
    exit(-1);

  // give up the CPU if this is a timer interrupt.
  if(which_dev == 2)
    yield();

  usertrapret();
}

Now need to increase the processing when the scause is 13 or 15

The changed code is:

void
usertrap(void)
{
  int which_dev = 0;

  if((r_sstatus() & SSTATUS_SPP) != 0)
    panic("usertrap: not from user mode");

  // send interrupts and exceptions to kerneltrap(),
  // since we're now in the kernel.
  w_stvec((uint64)kernelvec);

  struct proc *p = myproc();

  // save user program counter.
  p->trapframe->epc = r_sepc();

  if(r_scause() == 8){
    // system call

    if(p->killed)
      exit(-1);

    // sepc points to the ecall instruction,
    // but we want to return to the next instruction.
    p->trapframe->epc += 4;

    // an interrupt will change sstatus &c registers,
    // so don't enable until done with those registers.
    intr_on();

    syscall();
  } else if((which_dev = devintr()) != 0){
// ok
} else {
      uint64 va = PGROUNDDOWN(r_stval());
      if(va >= MAXVA){
          p->killed = 1;
          exit(-1);
      }
      pte_t* pte = walk(p->pagetable,va,0);
      if(pte == 0){
          p->killed = 1;
          exit(-1);
      }
      uint64 pa = PTE2PA(*pte);
      uint64 flag = PTE_FLAGS(*pte);
      if((r_scause() == 13 || r_scause() == 15) && (flag & PTE_C) ){
          char* mem = kalloc();
          if(mem == 0){
              printf("here\n");
              p->killed = 1;
              exit(-1);
          }
          memmove(mem,(char*)pa,PGSIZE);
          kfree((char*)pa);
          *pte = PA2PTE(mem) | (flag & ~PTE_C) | PTE_W;
      }
      else{
          printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
          printf("            sepc=%p stval=%p\n", r_sepc(), r_stval());
          p->killed = 1;
      }
  }

  if(p->killed)
    exit(-1);

  // give up the CPU if this is a timer interrupt.
  if(which_dev == 2)
    yield();

  usertrapret();
}

This part of the content is to allocate pages, modify the flag bits, and call kfree to process, reducing the reference count once.

Then modify the copyout function of kernel/vm.c.

The original code is:

int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len)
{
  uint64 n, va0, pa0;

  while(len > 0){
    va0 = PGROUNDDOWN(dstva);
    pa0 = walkaddr(pagetable, va0);
    if(pa0 == 0)
      return -1;
    n = PGSIZE - (dstva - va0);
    if(n > len)
      n = len;
    memmove((void *)(pa0 + (dstva - va0)), src, n);

    len -= n;
    src += n;
    dstva = va0 + PGSIZE;
  }
  return 0;
}

Now change to:

int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len)
{
  uint64 n, va0, pa0, flag;
  pte_t* pte;
  while(len > 0){
    va0 = PGROUNDDOWN(dstva);
    if(va0 >= MAXVA)
        return -1;
    pte = walk(pagetable,va0,0);
    if(pte == 0)
        return -1;
    pa0 = PTE2PA(*pte);
    flag = PTE_FLAGS(*pte);
    if(flag & PTE_C){
        char* mem = kalloc();
        if(mem == 0){
            return -1;
        }
        memmove(mem,(char*)pa0,PGSIZE);
        kfree((char*)pa0);
        *pte = PA2PTE((uint64)mem) | (flag & ~PTE_C) | PTE_W;
        pa0 = (uint64)mem;
    }
    if(pa0 == 0)
      return -1;
    n = PGSIZE - (dstva - va0);
    if(n > len)
      n = len;
    memmove((void *)(pa0 + (dstva - va0)), src, n);

    len -= n;
    src += n;
    dstva = va0 + PGSIZE;
  }
  return 0;
}

Don't forget to change the kalloc function, located in (kernel/kalloc.c)

Originally:

void *
kalloc(void)
{
  struct run *r;

  acquire(&kmem.lock);
  r = kmem.freelist;
  if(r)
    kmem.freelist = r->next;
  release(&kmem.lock);

  if(r)
    memset((char*)r, 5, PGSIZE); // fill with junk
  return (void*)r;
}

It is now necessary to initialize the reference count (set to 1) when allocating pages

That is:

void *
kalloc(void)
{
  struct run *r;

  acquire(&kmem.lock);
  r = kmem.freelist;
  if(r)
    kmem.freelist = r->next;
  release(&kmem.lock);

  if(r)
    memset((char*)r, 5, PGSIZE); // fill with junk

  if(r)
    reference_count[idx_rc((uint64)r)] = 1;
  return (void*)r;
}

Finally add the declaration in kernel/defs.h:

// kalloc.c
void            add_rc(uint64);
void            sub_rc(uint64);

// vm.c
pte_t*          walk(pagetable_t, uint64,int);

Here are the final experimental results:

references:

https://github.com/jlu-xiurui/MIT6.S081-2021-FALL/tree/master/lab6-cow

Tags: C Linux system architecture

Posted by dotsc on Wed, 25 Jan 2023 03:02:02 +0530