sdnvme: NVMe controller driver (work in progress)

basic NVMe controller driver, reads and writes work.
"namespaces" show up as logical units.
uses pin/msi interrupts (no msi-x support yet).
one submission queue per cpu, shared completion queue.
no recovery from fatal controller errors.
only tested in qemu (no hardware available).

committing this so it can be found by someone who has
hardware.
This commit is contained in:
cinap_lenrek 2017-03-29 00:21:35 +02:00
parent 83dd98022d
commit bfae9e08be

663
sys/src/9/pc/sdnvme.c Normal file
View file

@ -0,0 +1,663 @@
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "ureg.h"
#include "../port/error.h"
#include "../port/sd.h"
/* short names for the driver's private types, declared before use */
typedef struct WS WS;
typedef struct CQ CQ;
typedef struct SQ SQ;
typedef struct Ctlr Ctlr;
/*
 * Wait state of one outstanding command.  It is filled in by qcmd,
 * completed by nvmeintr and waited on by wcmd; callers in this file
 * keep it on their stack.
 */
struct WS
{
u32int cdw0; /* dword 0 of the completion entry, stored by nvmeintr */
ushort status; /* dword 3 of the completion entry >> 17; 0 is treated as success */
Rendez *sleep; /* rendezvous of the submitting process (&up->sleep) */
WS **link; /* slot in queue->wait[] that points at this WS while pending */
SQ *queue; /* submission queue the command was placed on */
};
/*
 * Completion queue.  Entries are 4 u32int (16 bytes) each, which is
 * why cqalloc derives shift as lgsize-4.
 */
struct CQ
{
u32int head; /* free-running consumer index; the slot is head & mask */
u32int mask; /* (1<<shift)-1 */
u32int shift; /* log2 of the number of entries */
u32int *base; /* queue memory; nil while the queue is not allocated */
Ctlr *ctlr; /* owning controller */
};
/*
 * Submission queue.  Entries are 16 u32int (64 bytes) each, which is
 * why sqalloc derives shift as lgsize-6.
 */
struct SQ
{
u32int tail; /* free-running producer index; the slot is tail & mask */
u32int mask; /* (1<<shift)-1 */
u32int shift; /* log2 of the number of entries */
u32int *base; /* queue memory */
WS **wait; /* one WS pointer per slot; nil when the slot is free */
Ctlr *ctlr; /* owning controller */
};
/* Per-controller state; controllers found by nvmepnpctlrs are chained through next. */
struct Ctlr
{
QLock; /* held from qcmd to wcmd while an admin command is being queued */
Lock intr; /* taken by nvmeintr and whenever ints is changed */
u32int ints; /* interrupt bits in use; written to IntMs/IntMc; 0 makes nvmeintr return early */
u32int irqc[2]; /* not referenced by the code in this file */
Pcidev *pci; /* PCI device of the controller */
u32int *reg; /* mapping of BAR0, indexed with the register enum below */
u64int cap; /* Cap0 | Cap1<<32 as read at probe time */
uchar *ident; /* 4K buffer receiving the identify controller data */
u32int *nsid; /* 4K buffer receiving the namespace id list */
int nnsid; /* number of leading non-zero entries in nsid */
u32int mps; /* mps = 1<<mpsshift */
u32int mpsshift;
u32int dstrd; /* cap bits 35:32; shift applied to doorbell indices */
CQ cq[1+1]; /* cq[0] admin queue, cq[1] shared i/o completion queue */
SQ sq[1+MAXMACH]; /* sq[0] admin queue, sq[1+machno] per-cpu i/o queue */
Ctlr *next;
};
/*
 * controller registers, as indices into the u32int array ctlr->reg
 * (hence the /4 on the byte offsets)
 */
enum {
Cap0, /* low half of ctlr->cap */
Cap1, /* high half of ctlr->cap */
Ver, /* not used by this file */
IntMs, /* written to mask interrupts */
IntMc, /* written to unmask interrupts */
CCfg, /* written 0 to disable the controller, see nvmeenable for the enable value */
CSts = 0x1C/4, /* low two bits: 1 means ready, bit 1 is reported as fatal */
Nssr, /* not used by this file */
AQAttr, /* admin sq mask | admin cq mask<<16 */
ASQBase0, /* physical address of the admin submission queue, low/high */
ASQBase1,
ACQBase0, /* physical address of the admin completion queue, low/high */
ACQBase1,
DBell = 0x1000/4, /* first doorbell; others are at (2*qid + iscq) << dstrd */
};
/*
 * Reserve the next slot of a submission queue, register ws as the
 * waiter for it and fill in the command entry.  The caller then sets
 * the command specific dwords (10-15) and calls wcmd(ws) to ring the
 * doorbell and wait for completion.
 *
 * adm != 0 selects the admin queue sq[0] and returns with ctlr
 * qlocked; otherwise the queue of the current cpu, sq[1+m->machno],
 * is used and the function returns at splhi.  Both are released by
 * wcmd.
 *
 * opc and nsid go to dwords 0 and 1, the physical address of mptr
 * (if any) to dwords 4-5.  data/len describe the transfer buffer:
 * dwords 6-7 get its physical address and dwords 8-9 the address of
 * the following mps-aligned page when the buffer extends past the
 * first page, else 0.  No more than two pages are described, so
 * callers must keep len within that limit.
 *
 * Returns the 16-dword queue entry.
 */
static u32int*
qcmd(WS *ws, Ctlr *ctlr, int adm, u32int opc, u32int nsid, void *mptr, void *data, ulong len)
{
	u32int cid, *e;
	u64int pa;
	SQ *sq;
	int i;

	if(!adm){
	Retry:
		splhi();
		sq = &ctlr->sq[1+m->machno];
	} else {
		qlock(ctlr);
		sq = &ctlr->sq[0];
	}
	ws->sleep = &up->sleep;
	ws->queue = sq;
	ws->link = &sq->wait[sq->tail & sq->mask];
	while(*ws->link != nil){
		/* slot still owned by an unfinished command: let others run */
		sched();
		if(!adm){
			/*
			 * should be very rare; start over so the
			 * queue is chosen from m->machno again
			 */
			goto Retry;
		}
	}
	*ws->link = ws;

	/* the command id is the free-running tail; nvmeintr masks it back to the slot */
	e = &sq->base[((cid = sq->tail++) & sq->mask)<<4];
	e[0] = opc | cid<<16;
	e[1] = nsid;
	e[2] = 0;
	e[3] = 0;
	if(mptr != nil){
		pa = PADDR(mptr);
		e[4] = pa;
		e[5] = pa>>32;
	} else {
		e[4] = 0;
		e[5] = 0;
	}
	if(len > 0){
		pa = PADDR(data);
		e[6] = pa;
		e[7] = pa>>32;
		if(len > ctlr->mps - (pa & ctlr->mps-1))
			pa += ctlr->mps - (pa & ctlr->mps-1);
		else
			pa = 0;
	} else {
		e[6] = 0;
		e[7] = 0;
		pa = 0;
	}
	e[8] = pa;
	e[9] = pa>>32;

	/*
	 * Slots are reused once the ring wraps and several callers only
	 * set e[10]; clear the command specific dwords so that nothing
	 * left over from an older command reaches the controller.
	 */
	for(i = 10; i < 16; i++)
		e[i] = 0;
	return e;
}
/*
 * Interrupt handler, also called by wcmd to poll for completions.
 * arg is the Ctlr.  Does nothing while ctlr->ints is 0.  Otherwise,
 * with the interrupt bits masked, every allocated completion queue is
 * drained: for each new entry the waiting WS is looked up, given the
 * result and woken.
 */
static void
nvmeintr(Ureg *, void *arg)
{
u32int phaseshift, *e;
WS *ws, **wp;
Ctlr *ctlr;
SQ *sq;
CQ *cq;
ctlr = arg;
if(ctlr->ints == 0)
return;
ilock(&ctlr->intr);
/* mask our interrupt bits while the queues are processed */
ctlr->reg[IntMs] = ctlr->ints;
/* walk the completion queues from the last one down to cq[0] */
for(cq = &ctlr->cq[nelem(ctlr->cq)-1]; cq >= ctlr->cq; cq--){
if(cq->base == nil)
continue;
/* moves bit 'shift' of head (it flips on every wrap of the ring) up to bit 16 */
phaseshift = 16 - cq->shift;
for(;; cq->head++){
e = &cq->base[(cq->head & cq->mask)<<2];
/* stop at the first entry whose bit 16 of dword 3 does not mark it as new for this pass */
if(((e[3] ^ (cq->head << phaseshift)) & 0x10000) == 0)
break;
if(0) iprint("nvmeintr: cq%d [%.4ux] %.8ux %.8ux %.8ux %.8ux\n",
(int)(cq - ctlr->cq), cq->head & cq->mask,
e[0], e[1], e[2], e[3]);
/* high half of dword 2 selects the submission queue */
sq = &ctlr->sq[e[2] >> 16];
/* low bits of dword 3 select the wait slot; presumably the command id set by qcmd */
wp = &sq->wait[e[3] & sq->mask];
if((ws = *wp) != nil && ws->link == wp){
/* fetch the Rendez first: clearing *wp makes wdone(ws) true */
Rendez *z = ws->sleep;
ws->cdw0 = e[0];
ws->status = e[3]>>17;
*wp = nil;
wakeup(z);
}
}
/* tell the controller how far this queue has been consumed */
ctlr->reg[DBell + ((cq-ctlr->cq)*2+1 << ctlr->dstrd)] = cq->head & cq->mask;
}
/* only reported; there is no recovery */
if((ctlr->reg[CSts] & 3) != 1)
iprint("nvmeintr: fatal controller error\n");
/* unmask again */
ctlr->reg[IntMc] = ctlr->ints;
iunlock(&ctlr->intr);
}
/*
 * Sleep condition for wcmd: the command is finished once its wait
 * slot no longer points at the WS passed in arg.
 */
static int
wdone(void *arg)
{
	WS *w;

	w = arg;
	if(*w->link == w)
		return 0;
	return 1;
}
/*
 * Submit the command prepared by qcmd(ws, ...) and wait until it has
 * completed.  Releases what qcmd acquired (splhi for a per-cpu queue,
 * the ctlr qlock for the admin queue).  Returns ws->status as stored
 * by nvmeintr.
 */
static u32int
wcmd(WS *ws)
{
SQ *sq = ws->queue;
Ctlr *ctlr = sq->ctlr;
/* the queue entry is written before the doorbell write below */
coherence();
/* submission doorbell of queue n is at index (2*n+0) << dstrd */
ctlr->reg[DBell + ((sq-ctlr->sq)*2+0 << ctlr->dstrd)] = sq->tail & sq->mask;
if(sq > ctlr->sq) {
/* still at splhi since qcmd, so this must be the current cpu's queue */
assert(sq == &ctlr->sq[1+m->machno]);
spllo();
} else
qunlock(sq->ctlr);
/*
 * An error raised while sleeping lands here and the wait simply
 * starts over.  NOTE(review): presumably because callers keep ws on
 * their stack, so returning before completion would leave a dangling
 * pointer in sq->wait - confirm.
 */
while(waserror())
;
tsleep(ws->sleep, wdone, ws, 5);
/* not done in time: poll the completion queues by hand and wait again */
while(!wdone(ws)){
nvmeintr(nil, ctlr);
tsleep(ws->sleep, wdone, ws, 10);
}
poperror();
return ws->status;
}
/*
 * Raise an error unless a command finished with status 0.  The
 * message is formatted into up->genbuf as "info: status <hex>".
 */
void
checkstatus(u32int status, char *info)
{
	if(status != 0){
		snprint(up->genbuf, sizeof(up->genbuf), "%s: status %ux", info, status);
		error(up->genbuf);
	}
}
/*
 * Block i/o entry point: transfer count sectors starting at lba
 * between the namespace behind unit u and the buffer a.  write
 * selects opcode 0x01, otherwise 0x02 is used.  lun is ignored.
 *
 * The request is cut into commands that fit the two pages qcmd can
 * describe.  A failing command raises an error through checkstatus.
 * Returns the number of bytes transferred.
 */
static long
nvmebio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
{
	u32int ns, secsize, nsec, maxsec, *cmd;
	Ctlr *ctlr;
	uchar *buf;
	WS ws;

	USED(lun);

	ctlr = u->dev->ctlr;
	ns = ctlr->nsid[u->subno];
	secsize = u->secsize;

	for(buf = a; count > 0; buf += nsec*secsize, count -= nsec, lba += nsec){
		/* sectors that fit between buf and the end of the second page */
		maxsec = (2*ctlr->mps - ((uintptr)buf & ctlr->mps-1)) / secsize;
		nsec = count;
		if(nsec > maxsec)
			nsec = maxsec;

		cmd = qcmd(&ws, ctlr, 0, write ? 0x01 : 0x02, ns, nil, buf, nsec*secsize);
		cmd[10] = lba;
		cmd[11] = lba>>32;
		cmd[12] = nsec-1;	/* the count field is zero based */
		cmd[13] = (count>nsec)<<6; /* sequential request */
		cmd[14] = 0;
		cmd[15] = 0;
		checkstatus(wcmd(&ws), write ? "write" : "read");
	}
	return buf - (uchar*)a;
}
/*
 * Raw (scsi style) request entry point.  Commands 0x35 and 0x91 are
 * answered with success straight away, everything else is offered to
 * sdfakescsi and sdfakescsirw; what remains is a read or write that
 * is carried out by nvmebio.
 */
static int
nvmerio(SDreq *r)
{
	int status, nsec, dir;
	uvlong lba;

	switch(r->cmd[0]){
	case 0x35:
	case 0x91:
		return sdsetsense(r, SDok, 0, 0, 0);
	}

	status = sdfakescsi(r);
	if(status != SDnostatus)
		return r->status = status;

	status = sdfakescsirw(r, &lba, &nsec, &dir);
	if(status != SDnostatus)
		return status;

	r->rlen = nvmebio(r->unit, r->lun, dir == SDwrite, r->data, nsec, lba);
	return r->status = SDok;
}
/*
 * A unit exists when its sub number indexes one of the namespace ids
 * collected by identify.
 */
static int
nvmeverify(SDunit *u)
{
	Ctlr *c;

	c = u->dev->ctlr;
	if(u->subno < c->nnsid)
		return 1;
	return 0;
}
/*
 * Bring a unit online by issuing identify namespace (admin opcode
 * 0x06 with dword 10 = 0) for its namespace id and deriving the
 * geometry from the answer: the sector count from the first 8 bytes
 * (little endian) and the sector size from the format entry selected
 * by the low 4 bits of byte 26, entries being 4 bytes each starting
 * at byte 128.
 *
 * Returns 1 if the geometry is already known (u->sectors != 0),
 * 2 after it has been read, and 0 on allocation failure, command
 * failure or an unusable sector size.
 */
static int
nvmeonline(SDunit *u)
{
	u32int *e, lbaf, lbads;
	uchar *info, *p;
	u64int nsect;
	Ctlr *ctlr;
	WS ws;
	int i;

	if(u->sectors != 0)
		return 1;

	ctlr = u->dev->ctlr;
	if((info = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
		return 0;

	e = qcmd(&ws, ctlr, 1, 0x06, ctlr->nsid[u->subno], nil, info, 0x1000);
	e[10] = 0; // identify namespace
	if(wcmd(&ws) != 0){
		free(info);
		return 0;
	}

	/*
	 * Assemble the values from unsigned operands.  A uchar shifted
	 * left by 24 is a signed int; with its top bit set it would sign
	 * extend into the upper half when combined with the 64 bit terms.
	 */
	nsect = 0;
	for(i = 7; i >= 0; i--)
		nsect = nsect<<8 | info[i];

	p = &info[128 + 4*(info[26]&15)];
	lbaf = (u32int)p[0] | (u32int)p[1]<<8 | (u32int)p[2]<<16 | (u32int)p[3]<<24;
	lbads = (lbaf>>16)&0xFF;	/* log2 of the sector size */
	free(info);

	/*
	 * Sectors smaller than 512 bytes are not valid for NVMe, and
	 * the shift below must stay within 32 bits.
	 */
	if(lbads < 9 || lbads > 31)
		return 0;

	u->sectors = nsect;
	u->secsize = (u32int)1<<lbads;

	memset(u->inquiry, 0, sizeof u->inquiry);
	u->inquiry[2] = 2;
	u->inquiry[3] = 2;
	u->inquiry[4] = sizeof u->inquiry - 4;
	/* 20 bytes from offset 24 of the identify controller data; nvmerctl prints them as the model */
	memmove(u->inquiry+8, ctlr->ident+24, 20);

	return 2;
}
/*
 * Format the ctl file text of a unit into the l bytes at p: model,
 * serial and firmware strings taken from the identify controller
 * data, followed by the geometry.  Returns the number of bytes
 * produced, 0 if no identify data is available.
 */
static int
nvmerctl(SDunit *u, char *p, int l)
{
	char *start, *end, *id;
	Ctlr *ctlr;

	ctlr = u->dev->ctlr;
	if(ctlr == nil || ctlr->ident == nil)
		return 0;

	id = (char*)ctlr->ident;
	start = p;
	end = p + l;

	p = seprint(p, end, "model\t%.20s\n", id+24);
	p = seprint(p, end, "serial\t%.10s\n", id+4);
	p = seprint(p, end, "firm\t%.6s\n", id+64);
	p = seprint(p, end, "geometry %llud %lud\n", u->sectors, u->secsize);

	return p - start;
}
/*
 * Initialise cq and allocate 1<<lgsize bytes of zeroed, mps aligned
 * queue memory for it.  Entries are 16 bytes, hence lgsize-4 bits of
 * index.  Raises Enomem when the allocation fails, else returns the
 * queue memory.
 */
static void*
cqalloc(Ctlr *ctlr, CQ *cq, u32int lgsize)
{
	cq->ctlr = ctlr;
	cq->head = 0;
	cq->shift = lgsize-4;
	cq->mask = (1<<cq->shift)-1;

	cq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0);
	if(cq->base == nil)
		error(Enomem);
	memset(cq->base, 0, 1<<lgsize);

	return cq->base;
}
/*
 * Initialise sq and allocate 1<<lgsize bytes of zeroed, mps aligned
 * queue memory plus the table of waiters, one WS pointer per entry.
 * Entries are 64 bytes, hence lgsize-6 bits of index.  Raises Enomem
 * when an allocation fails, else returns the queue memory.
 */
static void*
sqalloc(Ctlr *ctlr, SQ *sq, u32int lgsize)
{
	sq->ctlr = ctlr;
	sq->tail = 0;
	sq->shift = lgsize-6;
	sq->mask = (1<<sq->shift)-1;

	sq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0);
	if(sq->base == nil)
		error(Enomem);

	sq->wait = mallocz(sizeof(WS*)*(sq->mask+1), 1);
	if(sq->wait == nil)
		error(Enomem);

	memset(sq->base, 0, 1<<lgsize);
	return sq->base;
}
/*
 * Create the i/o queues with admin commands: one completion queue
 * (id 1) shared by all cpus and one 4K submission queue per cpu
 * (ids 1..conf.nmach), then add the completion queue's bit to the
 * set of enabled interrupts.  A failing command raises an error.
 */
static void
setupqueues(Ctlr *ctlr)
{
	u32int lg, *cmd;
	int cqid, sqid;
	CQ *cq;
	SQ *sq;
	WS ws;

	/* Overkill */
	for(lg = 12-6+4; lg < 16+4 && lg < ctlr->mpsshift && 1<<lg < conf.nmach<<12-6+4; lg++)
		;

	/* CQID1: shared completion queue */
	cq = &ctlr->cq[1];
	cqid = cq - ctlr->cq;
	cqalloc(ctlr, cq, lg);
	cmd = qcmd(&ws, ctlr, 1, 0x05, ~0, nil, cq->base, 1<<lg);
	cmd[10] = cqid | cq->mask<<16;
	cmd[11] = 3; /* IEN | PC */
	checkstatus(wcmd(&ws), "create completion queue");

	/* SQID[1..nmach]: submission queue per cpu */
	for(sqid = 1; sqid <= conf.nmach; sqid++){
		sq = &ctlr->sq[sqid];
		sqalloc(ctlr, sq, 12);
		cmd = qcmd(&ws, ctlr, 1, 0x01, ~0, nil, sq->base, 0x1000);
		cmd[10] = sqid | sq->mask<<16;
		cmd[11] = cqid<<16 | 1; /* CQID<<16 | PC */
		checkstatus(wcmd(&ws), "create submission queue");
	}

	/* start taking interrupts for the new completion queue */
	ilock(&ctlr->intr);
	ctlr->ints |= 1<<cqid;
	ctlr->reg[IntMc] = ctlr->ints;
	iunlock(&ctlr->intr);
}
/*
 * Read the identify controller data into ctlr->ident and the list of
 * namespace ids into ctlr->nsid (both 4K buffers, allocated on first
 * use), then count the ids into ctlr->nnsid.
 *
 * Raises an error if a buffer cannot be allocated or identify
 * controller fails.  A failing namespace list command is not fatal:
 * NOTE(review): as I understand the NVMe revisions, 1.0 controllers
 * do not implement it, so a single namespace with id 1 is assumed.
 */
static void
identify(Ctlr *ctlr)
{
	u32int *e;
	WS ws;

	if(ctlr->ident == nil)
		if((ctlr->ident = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
			error(Enomem);
	if(ctlr->nsid == nil)
		if((ctlr->nsid = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
			error(Enomem);

	e = qcmd(&ws, ctlr, 1, 0x06, ~0, nil, ctlr->ident, 0x1000);
	e[10] = 1; // identify controller
	checkstatus(wcmd(&ws), "identify controller");

	e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->nsid, 0x1000);
	e[10] = 2; // namespace list
	if(wcmd(&ws) != 0){
		/*
		 * The buffer content is undefined after a failed command;
		 * clear it so the scan below finds its terminator.
		 */
		memset(ctlr->nsid, 0, 0x1000);
		ctlr->nsid[0] = 1;	/* assume namespace #1 */
	}

	/* the list is terminated by a zero id; the buffer holds at most 1024 ids */
	ctlr->nnsid = 0;
	while(ctlr->nnsid < 1024 && ctlr->nsid[ctlr->nnsid] != 0)
		ctlr->nnsid++;
}
/*
 * Shut the controller down and release what nvmeenable set up:
 * interrupts are masked, the controller is disabled (waiting up to a
 * second for the ready bit to clear), the interrupt handler is
 * removed, bus mastering is switched off and all queue and identify
 * memory is freed.  Always returns 1.
 */
static int
nvmedisable(SDev *sd)
{
	char name[32];
	Ctlr *ctlr;
	int tries;
	SQ *sq;
	CQ *cq;

	ctlr = sd->ctlr;

	/* mask interrupts */
	ilock(&ctlr->intr);
	ctlr->ints = 0;
	ctlr->reg[IntMs] = ~ctlr->ints;
	iunlock(&ctlr->intr);

	/* disable controller */
	ctlr->reg[CCfg] = 0;
	for(tries = 10; tries > 0 && (ctlr->reg[CSts] & 1) != 0; tries--)
		tsleep(&up->sleep, return0, nil, 100);

	/* same name as used for intrenable in nvmeenable */
	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
	intrdisable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);

	pciclrbme(ctlr->pci); /* dma disable */

	/* queues that were never allocated hold nil pointers here */
	for(sq = ctlr->sq; sq < &ctlr->sq[nelem(ctlr->sq)]; sq++){
		free(sq->base);
		free(sq->wait);
	}
	for(cq = ctlr->cq; cq < &ctlr->cq[nelem(ctlr->cq)]; cq++)
		free(cq->base);
	memset(ctlr->sq, 0, sizeof(ctlr->sq));
	memset(ctlr->cq, 0, sizeof(ctlr->cq));

	free(ctlr->ident);
	ctlr->ident = nil;
	free(ctlr->nsid);
	ctlr->nsid = nil;
	ctlr->nnsid = 0;

	return 1;
}
/*
 * Enable the controller behind sd: install the interrupt handler,
 * set up the admin queues, switch the controller on and wait for it
 * to become ready, then run identify and setupqueues.
 * Returns 1 on success.  Any error raised on the way is printed,
 * everything is torn down again with nvmedisable and 0 is returned.
 */
static int
nvmeenable(SDev *sd)
{
char name[32];
Ctlr *ctlr;
u64int pa;
int to;
ctlr = sd->ctlr;
snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
intrenable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
/* error() calls below (also from cqalloc, sqalloc, identify, setupqueues) end up here */
if(waserror()){
print("%s: %s\n", name, up->errstr);
nvmedisable(sd);
sd->nunit = 0; /* hack: prevent further probing */
return 0;
}
/* admin queues are one mps sized page each */
pa = PADDR(cqalloc(ctlr, &ctlr->cq[0], ctlr->mpsshift));
ctlr->reg[ACQBase0] = pa;
ctlr->reg[ACQBase1] = pa>>32;
pa = PADDR(sqalloc(ctlr, &ctlr->sq[0], ctlr->mpsshift));
ctlr->reg[ASQBase0] = pa;
ctlr->reg[ASQBase1] = pa>>32;
/* queue sizes are passed as entries-1, i.e. the masks */
ctlr->reg[AQAttr] = ctlr->sq[0].mask | ctlr->cq[0].mask<<16;
/* dma enable */
pcisetbme(ctlr->pci);
/* enable interrupt */
ilock(&ctlr->intr);
ctlr->ints = 1;
ctlr->reg[IntMc] = ctlr->ints;
iunlock(&ctlr->intr);
/*
 * enable controller: bit 0 enables, the page size goes in at bit 7
 * as mpsshift-12; 6 and 4 match the log2 entry sizes used by
 * sqalloc and cqalloc
 */
ctlr->reg[CCfg] = 1 | (ctlr->mpsshift-12)<<7 | 6<<16 | 4<<20;
/* the timeout comes from cap bits 31:24 and is counted in 500ms steps */
for(to = (ctlr->cap>>24) & 255; to >= 0; to--){
tsleep(&up->sleep, return0, nil, 500);
if((ctlr->reg[CSts] & 3) == 1)
goto Ready;
}
if(ctlr->reg[CSts] & 2)
error("fatal controller status during initialization");
error("controller initialization timeout");
Ready:
identify(ctlr);
setupqueues(ctlr);
poperror();
return 1;
}
/*
 * Scan the PCI bus for NVMe controllers (class 1, subclass 8,
 * programming interface 2) and return them as a list of Ctlr linked
 * through next, nil if none was found.
 *
 * Each controller has BAR0 mapped, its capabilities read, interrupts
 * masked and is left disabled.  The page size is chosen as 64K where
 * the controller's supported range allows it.  Controllers whose
 * BAR0 is missing, is an i/o space BAR or cannot be mapped, and
 * controllers without the NVM command set, are skipped.
 */
static Ctlr*
nvmepnpctlrs(void)
{
	Ctlr *ctlr, *h, *t;
	Pcidev *p;
	int i;

	h = t = nil;
	for(p = nil; p = pcimatch(p, 0, 0);){
		if(p->ccrb != 1 || p->ccru != 8 || p->ccrp != 2)
			continue;
		/* the registers must be behind a memory BAR; bit 0 set marks i/o space */
		if(p->mem[0].size == 0 || (p->mem[0].bar & 1) != 0)
			continue;
		if((ctlr = malloc(sizeof(*ctlr))) == nil){
			print("nvme: no memory for Ctlr\n");
			break;
		}
		ctlr->pci = p;
		ctlr->reg = vmap(p->mem[0].bar & ~0xF, p->mem[0].size);
		if(ctlr->reg == nil){
			print("nvme: can't vmap bar0\n");
			free(ctlr);
			continue;
		}
		ctlr->cap = ctlr->reg[Cap0];
		ctlr->cap |= (u64int)ctlr->reg[Cap1]<<32;

		/* mask interrupts */
		ctlr->ints = 0;
		ctlr->reg[IntMs] = ~ctlr->ints;

		/* disable controller */
		ctlr->reg[CCfg] = 0;

		/* cap bit 37 announces the NVM command set */
		if((ctlr->cap&(1ULL<<37)) == 0){
			print("nvme: doesn't support NVM command set: %ux\n",
				(u32int)(ctlr->cap>>37) & 0xFF);
			vunmap(ctlr->reg, p->mem[0].size);
			free(ctlr);
			continue;
		}

		/*
		 * use 64K page size when possible: start at the minimum
		 * from cap bits 51:48 and go up towards the maximum in
		 * bits 55:52, both counted as log2(size)-12
		 */
		ctlr->dstrd = (ctlr->cap >> 32) & 15;
		for(i = (ctlr->cap >> 48) & 15; i < ((ctlr->cap >> 52) & 15); i++){
			if(i >= 16-12) /* 64K */
				break;
		}
		ctlr->mpsshift = i+12;
		ctlr->mps = 1 << ctlr->mpsshift;

		if(h == nil)
			h = ctlr;
		else
			t->next = ctlr;
		t = ctlr;
	}
	return h;
}
SDifc sdnvmeifc;

/*
 * pnp entry point: wrap every controller found by nvmepnpctlrs in an
 * SDev.  Devices are numbered 'N', 'O', ... and announce 1024 units
 * each; nvmeverify later decides which of them exist.
 * Returns the list of devices, nil if there are none.
 */
static SDev*
nvmepnp(void)
{
	SDev *sdev, *head, **tailp;
	Ctlr *ctlr;
	int id;

	head = nil;
	tailp = &head;
	id = 'N';
	for(ctlr = nvmepnpctlrs(); ctlr != nil; ctlr = ctlr->next){
		sdev = malloc(sizeof(*sdev));
		if(sdev == nil)
			break;
		sdev->ctlr = ctlr;
		sdev->idno = id++;
		sdev->ifc = &sdnvmeifc;
		sdev->nunit = 1024;

		/* append to the list */
		*tailp = sdev;
		tailp = &sdev->next;
	}
	return head;
}
/*
 * Interface table handed to the sd layer; entries left nil are
 * operations this driver does not provide.
 */
SDifc sdnvmeifc = {
"nvme", /* name */
nvmepnp, /* pnp */
nil, /* legacy */
nvmeenable, /* enable */
nvmedisable, /* disable */
nvmeverify, /* verify */
nvmeonline, /* online */
nvmerio, /* rio */
nvmerctl, /* rctl */
nil, /* wctl */
nvmebio, /* bio */
nil, /* probe */
nil, /* clear */
nil, /* rtopctl */
nil, /* wtopctl */
};