plan9fox/sys/src/9/pc/ethervirtio.c
cinap_lenrek f58d99aa7a virtio: add non-legacy virtio 1.0 drivers for disk and ethernet
The new interface uses pci capability structures to locate the
registers in a rather fine granular way making it more complicated
as they can be located anywhere in any pci bar at any offset.

As far as i can see, qemu (6.0.50) never uses i/o bars in
non-legacy mode, so only mmio is implemented for now.

The previous virtio drivers implemented the legacy interface only
which uses i/o ports for all register accesses. This is still
the preferred method (and also qemu default) as it is easier to
emulate and most likely faster.

However, some vps providers like vultr force the legacy interface
to disabled with qemu -device option "disable-legacy=on" resulting
on a system without a disk and ethernet.
2021-07-11 11:24:13 +00:00

692 lines
12 KiB
C

/*
* virtio ethernet driver implementing the legacy interface:
* http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
*/
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/pci.h"
#include "../port/error.h"
#include "../port/netif.h"
#include "../port/etherif.h"
typedef struct Vring Vring;
typedef struct Vdesc Vdesc;
typedef struct Vused Vused;
typedef struct Vheader Vheader;
typedef struct Vqueue Vqueue;
typedef struct Ctlr Ctlr;
enum {
/* §2.1 Device Status Field */
Sacknowledge = 1,
Sdriver = 2,
Sdriverok = 4,
Sfeatureok = 8,
Sfailed = 128,
/* §4.1.4.8 Legacy Interfaces: A Note on PCI Device Layout */
Qdevfeat = 0,
Qdrvfeat = 4,
Qaddr = 8,
Qsize = 12,
Qselect = 14,
Qnotify = 16,
Qstatus = 18,
Qisr = 19,
Qmac = 20,
Qnetstatus = 26,
/* flags in Qnetstatus */
Nlinkup = (1<<0),
Nannounce = (1<<1),
/* feature bits */
Fmac = (1<<5),
Fstatus = (1<<16),
Fctrlvq = (1<<17),
Fctrlrx = (1<<18),
/* vring used flags */
Unonotify = 1,
/* vring avail flags */
Rnointerrupt = 1,
/* descriptor flags */
Dnext = 1,
Dwrite = 2,
Dindirect = 4,
/* struct sizes */
VringSize = 4,
VdescSize = 16,
VusedSize = 8,
VheaderSize = 10,
/* §4.1.5.1.4.1 says pages are 4096 bytes
* for the purposes of the driver.
*/
VBY2PG = 4096,
#define VPGROUND(s) ROUND(s, VBY2PG)
Vrxq = 0,
Vtxq = 1,
Vctlq = 2,
/* class/cmd for Vctlq */
CtrlRx = 0x00,
CmdPromisc = 0x00,
CmdAllmulti = 0x01,
CtrlMac = 0x01,
CmdMacTableSet = 0x00,
CtrlVlan= 0x02,
CmdVlanAdd = 0x00,
CmdVlanDel = 0x01,
};
struct Vring
{
u16int flags;
u16int idx;
};
struct Vdesc
{
u64int addr;
u32int len;
u16int flags;
u16int next;
};
struct Vused
{
u32int id;
u32int len;
};
struct Vheader
{
u8int flags;
u8int segtype;
u16int hlen;
u16int seglen;
u16int csumstart;
u16int csumend;
};
/* §2.4 Virtqueues */
struct Vqueue
{
Rendez;
uint qsize;
uint qmask;
Vdesc *desc;
Vring *avail;
u16int *availent;
u16int *availevent;
Vring *used;
Vused *usedent;
u16int *usedevent;
u16int lastused;
uint nintr;
uint nnote;
};
struct Ctlr {
Lock;
QLock ctllock;
int attached;
int port;
Pcidev *pcidev;
Ctlr *next;
int active;
int id;
int typ;
ulong feat;
int nqueue;
/* virtioether has 3 queues: rx, tx and ctl */
Vqueue queue[3];
};
static Ctlr *ctlrhead;
static int
vhasroom(void *v)
{
Vqueue *q = v;
return q->lastused != q->used->idx;
}
static void
vqnotify(Ctlr *ctlr, int x)
{
Vqueue *q;
coherence();
q = &ctlr->queue[x];
if(q->used->flags & Unonotify)
return;
q->nnote++;
outs(ctlr->port+Qnotify, x);
}
static void
txproc(void *v)
{
Vheader *header;
Block **blocks;
Ether *edev;
Ctlr *ctlr;
Vqueue *q;
Vused *u;
Block *b;
int i, j;
edev = v;
ctlr = edev->ctlr;
q = &ctlr->queue[Vtxq];
header = smalloc(VheaderSize);
blocks = smalloc(sizeof(Block*) * (q->qsize/2));
for(i = 0; i < q->qsize/2; i++){
j = i << 1;
q->desc[j].addr = PADDR(header);
q->desc[j].len = VheaderSize;
q->desc[j].next = j | 1;
q->desc[j].flags = Dnext;
q->availent[i] = q->availent[i + q->qsize/2] = j;
j |= 1;
q->desc[j].next = 0;
q->desc[j].flags = 0;
}
q->avail->flags &= ~Rnointerrupt;
while(waserror())
;
while((b = qbread(edev->oq, 1000000)) != nil){
for(;;){
/* retire completed packets */
while((i = q->lastused) != q->used->idx){
u = &q->usedent[i & q->qmask];
i = (u->id & q->qmask) >> 1;
if(blocks[i] == nil)
break;
freeb(blocks[i]);
blocks[i] = nil;
q->lastused++;
}
/* have free slot? */
i = q->avail->idx & (q->qmask >> 1);
if(blocks[i] == nil)
break;
/* ring full, wait and retry */
if(!vhasroom(q))
sleep(q, vhasroom, q);
}
/* slot is free, fill in descriptor */
blocks[i] = b;
j = (i << 1) | 1;
q->desc[j].addr = PADDR(b->rp);
q->desc[j].len = BLEN(b);
coherence();
q->avail->idx++;
vqnotify(ctlr, Vtxq);
}
pexit("ether out queue closed", 1);
}
static void
rxproc(void *v)
{
Vheader *header;
Block **blocks;
Ether *edev;
Ctlr *ctlr;
Vqueue *q;
Vused *u;
Block *b;
int i, j;
edev = v;
ctlr = edev->ctlr;
q = &ctlr->queue[Vrxq];
header = smalloc(VheaderSize);
blocks = smalloc(sizeof(Block*) * (q->qsize/2));
for(i = 0; i < q->qsize/2; i++){
j = i << 1;
q->desc[j].addr = PADDR(header);
q->desc[j].len = VheaderSize;
q->desc[j].next = j | 1;
q->desc[j].flags = Dwrite|Dnext;
q->availent[i] = q->availent[i + q->qsize/2] = j;
j |= 1;
q->desc[j].next = 0;
q->desc[j].flags = Dwrite;
}
q->avail->flags &= ~Rnointerrupt;
while(waserror())
;
for(;;){
/* replenish receive ring */
do {
i = q->avail->idx & (q->qmask >> 1);
if(blocks[i] != nil)
break;
if((b = iallocb(ETHERMAXTU)) == nil)
break;
blocks[i] = b;
j = (i << 1) | 1;
q->desc[j].addr = PADDR(b->rp);
q->desc[j].len = BALLOC(b);
coherence();
q->avail->idx++;
} while(q->avail->idx != q->used->idx);
vqnotify(ctlr, Vrxq);
/* wait for any packets to complete */
if(!vhasroom(q))
sleep(q, vhasroom, q);
/* retire completed packets */
while((i = q->lastused) != q->used->idx) {
u = &q->usedent[i & q->qmask];
i = (u->id & q->qmask) >> 1;
if((b = blocks[i]) == nil)
break;
blocks[i] = nil;
b->wp = b->rp + u->len - VheaderSize;
etheriq(edev, b);
q->lastused++;
}
}
}
static int
vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)
{
uchar hdr[2], ack[1];
Ctlr *ctlr;
Vqueue *q;
Vdesc *d;
int i;
ctlr = edev->ctlr;
q = &ctlr->queue[Vctlq];
if(q->qsize < 3)
return -1;
qlock(&ctlr->ctllock);
while(waserror())
;
ack[0] = 0x55;
hdr[0] = class;
hdr[1] = cmd;
d = &q->desc[0];
d->addr = PADDR(hdr);
d->len = sizeof(hdr);
d->next = 1;
d->flags = Dnext;
d++;
d->addr = PADDR(data);
d->len = ndata;
d->next = 2;
d->flags = Dnext;
d++;
d->addr = PADDR(ack);
d->len = sizeof(ack);
d->next = 0;
d->flags = Dwrite;
i = q->avail->idx & q->qmask;
q->availent[i] = 0;
coherence();
q->avail->flags &= ~Rnointerrupt;
q->avail->idx++;
vqnotify(ctlr, Vctlq);
while(!vhasroom(q))
sleep(q, vhasroom, q);
q->lastused = q->used->idx;
q->avail->flags |= Rnointerrupt;
qunlock(&ctlr->ctllock);
poperror();
if(ack[0] != 0)
print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
return ack[0];
}
static void
interrupt(Ureg*, void* arg)
{
Ether *edev;
Ctlr *ctlr;
Vqueue *q;
int i;
edev = arg;
ctlr = edev->ctlr;
if(inb(ctlr->port+Qisr) & 1){
for(i = 0; i < ctlr->nqueue; i++){
q = &ctlr->queue[i];
if(vhasroom(q)){
q->nintr++;
wakeup(q);
}
}
}
}
static void
attach(Ether* edev)
{
char name[KNAMELEN];
Ctlr* ctlr;
ctlr = edev->ctlr;
lock(ctlr);
if(ctlr->attached){
unlock(ctlr);
return;
}
ctlr->attached = 1;
unlock(ctlr);
/* ready to go */
outb(ctlr->port+Qstatus, inb(ctlr->port+Qstatus) | Sdriverok);
/* start kprocs */
snprint(name, sizeof name, "#l%drx", edev->ctlrno);
kproc(name, rxproc, edev);
snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
kproc(name, txproc, edev);
}
static long
ifstat(Ether *edev, void *a, long n, ulong offset)
{
int i, l;
char *p;
Ctlr *ctlr;
Vqueue *q;
ctlr = edev->ctlr;
p = smalloc(READSTR);
l = snprint(p, READSTR, "devfeat %32.32lub\n", ctlr->feat);
l += snprint(p+l, READSTR-l, "drvfeat %32.32lub\n", inl(ctlr->port+Qdrvfeat));
l += snprint(p+l, READSTR-l, "devstatus %8.8ub\n", inb(ctlr->port+Qstatus));
if(ctlr->feat & Fstatus)
l += snprint(p+l, READSTR-l, "netstatus %8.8ub\n", inb(ctlr->port+Qnetstatus));
for(i = 0; i < ctlr->nqueue; i++){
q = &ctlr->queue[i];
l += snprint(p+l, READSTR-l,
"vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
i, q, q->qsize, q->avail->idx, q->used->idx, q->lastused, q->nintr, q->nnote);
}
n = readstr(offset, a, n, p);
free(p);
return n;
}
static void
shutdown(Ether* edev)
{
Ctlr *ctlr = edev->ctlr;
outb(ctlr->port+Qstatus, 0);
pciclrbme(ctlr->pcidev);
}
static void
promiscuous(void *arg, int on)
{
Ether *edev = arg;
uchar b[1];
b[0] = on != 0;
vctlcmd(edev, CtrlRx, CmdPromisc, b, sizeof(b));
}
static void
multicast(void *arg, uchar*, int)
{
Ether *edev = arg;
uchar b[1];
b[0] = edev->nmaddr > 0;
vctlcmd(edev, CtrlRx, CmdAllmulti, b, sizeof(b));
}
/* §2.4.2 Legacy Interfaces: A Note on Virtqueue Layout */
static ulong
queuesize(ulong size)
{
return VPGROUND(VdescSize*size + sizeof(u16int)*(3+size))
+ VPGROUND(sizeof(u16int)*3 + VusedSize*size);
}
static int
initqueue(Vqueue *q, int size)
{
uchar *p;
/* §2.4: Queue Size value is always a power of 2 and <= 32768 */
assert(!(size & (size - 1)) && size <= 32768);
p = mallocalign(queuesize(size), VBY2PG, 0, 0);
if(p == nil){
print("ethervirtio: no memory for Vqueue\n");
free(p);
return -1;
}
q->desc = (void*)p;
p += VdescSize*size;
q->avail = (void*)p;
p += VringSize;
q->availent = (void*)p;
p += sizeof(u16int)*size;
q->availevent = (void*)p;
p += sizeof(u16int);
p = (uchar*)VPGROUND((uintptr)p);
q->used = (void*)p;
p += VringSize;
q->usedent = (void*)p;
p += VusedSize*size;
q->usedevent = (void*)p;
q->qsize = size;
q->qmask = q->qsize - 1;
q->lastused = q->avail->idx = q->used->idx = 0;
q->avail->flags |= Rnointerrupt;
return 0;
}
static Ctlr*
pciprobe(int typ)
{
Ctlr *c, *h, *t;
Pcidev *p;
int n, i;
h = t = nil;
/* §4.1.2 PCI Device Discovery */
for(p = nil; p = pcimatch(p, 0x1AF4, 0);){
/* the two possible DIDs for virtio-net */
if(p->did != 0x1000 && p->did != 0x1041)
continue;
/*
* non-transitional devices will have a revision > 0,
* these are handled by ethervirtio10 driver.
*/
if(p->rid != 0)
continue;
/* first membar needs to be I/O */
if((p->mem[0].bar & 1) == 0)
continue;
/* non-transitional device will have typ+0x40 */
if(pcicfgr16(p, 0x2E) != typ)
continue;
if((c = mallocz(sizeof(Ctlr), 1)) == nil){
print("ethervirtio: no memory for Ctlr\n");
break;
}
c->port = p->mem[0].bar & ~3;
if(ioalloc(c->port, p->mem[0].size, 0, "ethervirtio") < 0){
print("ethervirtio: port %ux in use\n", c->port);
free(c);
continue;
}
c->typ = typ;
c->pcidev = p;
pcienable(p);
c->id = (p->did<<16)|p->vid;
/* §3.1.2 Legacy Device Initialization */
outb(c->port+Qstatus, 0);
while(inb(c->port+Qstatus) != 0)
delay(1);
outb(c->port+Qstatus, Sacknowledge|Sdriver);
/* negotiate feature bits */
c->feat = inl(c->port+Qdevfeat);
outl(c->port+Qdrvfeat, c->feat & (Fmac|Fstatus|Fctrlvq|Fctrlrx));
/* §4.1.5.1.4 Virtqueue Configuration */
for(i=0; i<nelem(c->queue); i++){
outs(c->port+Qselect, i);
n = ins(c->port+Qsize);
if(n == 0 || (n & (n-1)) != 0){
if(i < 2)
print("ethervirtio: queue %d has invalid size %d\n", i, n);
break;
}
if(initqueue(&c->queue[i], n) < 0)
break;
coherence();
outl(c->port+Qaddr, PADDR(c->queue[i].desc)/VBY2PG);
}
if(i < 2){
print("ethervirtio: no queues\n");
pcidisable(p);
free(c);
continue;
}
c->nqueue = i;
if(h == nil)
h = c;
else
t->next = c;
t = c;
}
return h;
}
static int
reset(Ether* edev)
{
static uchar zeros[Eaddrlen];
Ctlr *ctlr;
int i;
if(ctlrhead == nil)
ctlrhead = pciprobe(1);
for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
if(ctlr->active)
continue;
if(edev->port == 0 || edev->port == ctlr->port){
ctlr->active = 1;
break;
}
}
if(ctlr == nil)
return -1;
edev->ctlr = ctlr;
edev->port = ctlr->port;
edev->irq = ctlr->pcidev->intl;
edev->tbdf = ctlr->pcidev->tbdf;
edev->mbps = 1000;
edev->link = 1;
if((ctlr->feat & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){
for(i = 0; i < Eaddrlen; i++)
edev->ea[i] = inb(ctlr->port+Qmac+i);
} else {
for(i = 0; i < Eaddrlen; i++)
outb(ctlr->port+Qmac+i, edev->ea[i]);
}
edev->arg = edev;
edev->attach = attach;
edev->shutdown = shutdown;
edev->ifstat = ifstat;
if((ctlr->feat & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){
edev->multicast = multicast;
edev->promiscuous = promiscuous;
}
pcisetbme(ctlr->pcidev);
intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);
return 0;
}
void
ethervirtiolink(void)
{
addethercard("virtio", reset);
}