kernel: avoid inconsistent reads in /proc/#/fd and /proc/#/ns

to allow bytewise access to /proc/#/fd, the contents of the file where
recreated on each call. if fd's had been closed or reassigned between
the reads, the offset would be inconsistent and a read could start off
in the middle of a line. this happens when you cat /proc/#/fd file of
a busy process that mutates its filedescriptor table.

to fix this, we now return one line record at a time. if the line
fits in the read size, then this means the next read will always start
at the beginning of the next line record. we remember the consumed
byte count in Chan.mrock and the current record in Chan.nrock. (these
fields are free to usefor non-directory files)

if a read comes in and the offset is the same as c->mrock, we do not
need to regenerate the file and just render the next c->nrock's record.

for reads smaller than the line count, we have to regenerate the content
up to the offset and the race is still possible, but this should not
be the common case.

the same algorithm is now used for /proc/#/ns file, allowing a simpler
reimplementation and getting rid of Mntwalk state strcture.
This commit is contained in:
cinap_lenrek 2014-12-21 04:46:22 +01:00
parent 8ac28ac11c
commit cb35d1a132
4 changed files with 170 additions and 231 deletions

View file

@ -109,7 +109,7 @@ dupread(Chan *c, void *va, long n, vlong offset)
fd = twicefd/2;
if(twicefd & 1){
c = fdtochan(fd, -1, 0, 1);
procfdprint(c, fd, 0, buf, sizeof buf);
procfdprint(c, fd, buf, sizeof buf);
cclose(c);
return readstr((ulong)offset, va, n, buf);
}

View file

@ -155,7 +155,6 @@ void procctlreq(Proc*, char*, int);
int procctlmemio(Proc*, uintptr, int, void*, int);
Chan* proctext(Chan*, Proc*);
int procstopped(void*);
void mntscan(Mntwalk*, Proc*);
ulong procpagecount(Proc *);
static Traceevent *tevents;
@ -400,6 +399,7 @@ procopen(Chan *c, int omode)
case Qkregs:
case Qsegment:
case Qprofile:
case Qns:
case Qfd:
if(omode != OREAD)
error(Eperm);
@ -428,12 +428,6 @@ procopen(Chan *c, int omode)
nonone(p);
break;
case Qns:
if(omode != OREAD)
error(Eperm);
c->aux = smalloc(sizeof(Mntwalk));
break;
case Qnotepg:
nonone(p);
pg = p->pgrp;
@ -514,103 +508,6 @@ procwstat(Chan *c, uchar *db, int n)
return n;
}
static long
procoffset(long offset, char *va, int *np)
{
if(offset > 0) {
offset -= *np;
if(offset < 0) {
memmove(va, va+*np+offset, -offset);
*np = -offset;
}
else
*np = 0;
}
return offset;
}
static int
procqidwidth(Chan *c)
{
char buf[32];
return sprint(buf, "%lud", c->qid.vers);
}
int
procfdprint(Chan *c, int fd, int w, char *s, int ns)
{
int n;
if(w == 0)
w = procqidwidth(c);
n = snprint(s, ns, "%3d %.2s %C %4ld (%.16llux %*lud %.2ux) %5ld %8lld %s\n",
fd,
&"r w rw"[(c->mode&3)<<1],
devtab[c->type]->dc, c->dev,
c->qid.path, w, c->qid.vers, c->qid.type,
c->iounit, c->offset, c->path->s);
return n;
}
static int
procfds(Proc *p, char *va, int count, long offset)
{
Fgrp *f;
Chan *c;
char buf[256];
int n, i, w, ww;
char *a;
/* print to buf to avoid holding fgrp lock while writing to user space */
if(count > sizeof buf)
count = sizeof buf;
a = buf;
eqlock(&p->debug);
f = p->fgrp;
if(f == nil || p->dot == nil){
qunlock(&p->debug);
return 0;
}
lock(f);
if(waserror()){
unlock(f);
qunlock(&p->debug);
nexterror();
}
n = readstr(0, a, count, p->dot->path->s);
n += snprint(a+n, count-n, "\n");
offset = procoffset(offset, a, &n);
/* compute width of qid.path */
w = 0;
for(i = 0; i <= f->maxfd; i++) {
c = f->fd[i];
if(c == nil)
continue;
ww = procqidwidth(c);
if(ww > w)
w = ww;
}
for(i = 0; i <= f->maxfd; i++) {
c = f->fd[i];
if(c == nil)
continue;
n += procfdprint(c, i, w, a+n, count-n);
offset = procoffset(offset, a, &n);
}
unlock(f);
qunlock(&p->debug);
poperror();
/* copy result to user space, now that locks are released */
memmove(va, buf, n);
return n;
}
static void
procclose(Chan *c)
{
@ -622,29 +519,6 @@ procclose(Chan *c)
proctrace = nil;
unlock(&tlock);
}
if(QID(c->qid) == Qns && c->aux != nil){
free(c->aux);
c->aux = nil;
}
}
static void
int2flag(int flag, char *s)
{
if(flag == 0){
*s = '\0';
return;
}
*s++ = '-';
if(flag & MAFTER)
*s++ = 'a';
if(flag & MBEFORE)
*s++ = 'b';
if(flag & MCREATE)
*s++ = 'c';
if(flag & MCACHE)
*s++ = 'C';
*s = '\0';
}
static int
@ -690,6 +564,127 @@ prochaswaitq(void *x)
return p->pid != PID(c->qid) || p->waitq != nil;
}
static void
int2flag(int flag, char *s)
{
if(flag == 0){
*s = '\0';
return;
}
*s++ = '-';
if(flag & MAFTER)
*s++ = 'a';
if(flag & MBEFORE)
*s++ = 'b';
if(flag & MCREATE)
*s++ = 'c';
if(flag & MCACHE)
*s++ = 'C';
*s = '\0';
}
static int
readns1(Chan *c, Proc *p, char *buf, int nbuf)
{
Pgrp *pg;
Mount *t, *cm;
Mhead *f, *mh;
ulong minid, bestmid;
char flag[10], *srv;
int i;
pg = p->pgrp;
if(pg == nil || p->dot == nil || p->pid != PID(c->qid))
error(Eprocdied);
bestmid = ~0;
minid = c->nrock;
if(minid == bestmid)
return 0;
rlock(&pg->ns);
mh = nil;
cm = nil;
for(i = 0; i < MNTHASH; i++) {
for(f = pg->mnthash[i]; f != nil; f = f->hash) {
for(t = f->mount; t != nil; t = t->next) {
if(t->mountid >= minid && t->mountid < bestmid) {
bestmid = t->mountid;
cm = t;
mh = f;
}
}
}
}
if(bestmid == ~0) {
c->nrock = bestmid;
i = snprint(buf, nbuf, "cd %s\n", p->dot->path->s);
} else {
c->nrock = bestmid+1;
int2flag(cm->mflag, flag);
if(strcmp(cm->to->path->s, "#M") == 0){
srv = srvname(cm->to->mchan);
i = snprint(buf, nbuf, "mount %s %s %s %s\n", flag,
srv==nil? cm->to->mchan->path->s : srv,
mh->from->path->s, cm->spec? cm->spec : "");
free(srv);
}else{
i = snprint(buf, nbuf, "bind %s %s %s\n", flag,
cm->to->path->s, mh->from->path->s);
}
}
runlock(&pg->ns);
return i;
}
int
procfdprint(Chan *c, int fd, char *s, int ns)
{
return snprint(s, ns, "%3d %.2s %C %4ld (%.16llux %lud %.2ux) %5ld %8lld %s\n",
fd,
&"r w rw"[(c->mode&3)<<1],
devtab[c->type]->dc, c->dev,
c->qid.path, c->qid.vers, c->qid.type,
c->iounit, c->offset, c->path->s);
}
static int
readfd1(Chan *c, Proc *p, char *buf, int nbuf)
{
Fgrp *fg;
int n, i;
fg = p->fgrp;
if(fg == nil || p->dot == nil || p->pid != PID(c->qid))
return 0;
if(c->nrock == 0){
c->nrock = 1;
return snprint(buf, nbuf, "%s\n", p->dot->path->s);
}
lock(fg);
n = 0;
for(;;){
i = c->nrock-1;
if(i < 0 || i > fg->maxfd)
break;
c->nrock++;
if(fg->fd[i] != nil){
n = procfdprint(fg->fd[i], i, buf, nbuf);
break;
}
}
unlock(fg);
return n;
}
/*
* userspace can't pass negative file offset for a
* 64 bit kernel address, so we use 63 bit and sign
@ -706,15 +701,13 @@ off2addr(vlong off)
static long
procread(Chan *c, void *va, long n, vlong off)
{
/* NSEG*32 was too small for worst cases */
char *a, flag[10], *sps, *srv, statbuf[NSEG*64];
int i, j, m, navail, ne, rsize;
char *a, *sps, statbuf[1024];
int i, j, navail, ne, rsize;
long l;
uchar *rptr;
uintptr addr;
ulong offset;
Confmem *cm;
Mntwalk *mw;
Proc *p;
Segment *sg, *s;
Ureg kur;
@ -753,14 +746,16 @@ procread(Chan *c, void *va, long n, vlong off)
switch(QID(c->qid)){
case Qargs:
eqlock(&p->debug);
j = procargs(p, up->genbuf, sizeof up->genbuf);
j = procargs(p, statbuf, sizeof(statbuf));
qunlock(&p->debug);
if(offset >= j)
return 0;
if(offset+n > j)
n = j-offset;
memmove(a, &up->genbuf[offset], n);
statbufread:
memmove(a, statbuf+offset, n);
return n;
case Qsyscall:
eqlock(&p->debug);
if(waserror()){
@ -769,13 +764,12 @@ procread(Chan *c, void *va, long n, vlong off)
}
if(p->pid != PID(c->qid))
error(Eprocdied);
j = 0;
if(p->syscalltrace != nil)
n = readstr(offset, a, n, p->syscalltrace);
else
n = 0;
j = readstr(offset, a, n, p->syscalltrace);
qunlock(&p->debug);
poperror();
return n;
return j;
case Qmem:
addr = off2addr(off);
@ -830,17 +824,15 @@ procread(Chan *c, void *va, long n, vlong off)
if(p->nnote == 0)
n = 0;
else {
m = strlen(p->note[0].msg) + 1;
if(m > n)
m = n;
memmove(va, p->note[0].msg, m-1);
((char*)va)[m-1] = '\0';
p->nnote--;
i = strlen(p->note[0].msg) + 1;
if(i < n)
n = i;
memmove(a, p->note[0].msg, n-1);
a[n-1] = '\0';
if(--p->nnote == 0)
p->notepending = 0;
memmove(p->note, p->note+1, p->nnote*sizeof(Note));
n = m;
}
if(p->nnote == 0)
p->notepending = 0;
poperror();
qunlock(&p->debug);
return n;
@ -905,8 +897,7 @@ procread(Chan *c, void *va, long n, vlong off)
readnum(0, statbuf+j+NUMSIZE*6, NUMSIZE, procpagecount(p)*BY2PG/1024, NUMSIZE);
readnum(0, statbuf+j+NUMSIZE*7, NUMSIZE, p->basepri, NUMSIZE);
readnum(0, statbuf+j+NUMSIZE*8, NUMSIZE, p->priority, NUMSIZE);
memmove(a, statbuf+offset, n);
return n;
goto statbufread;
case Qsegment:
j = 0;
@ -924,10 +915,7 @@ procread(Chan *c, void *va, long n, vlong off)
return 0;
if(offset+n > j)
n = j-offset;
if(n == 0 && offset == 0)
exhausted("segments");
memmove(a, &statbuf[offset], n);
return n;
goto statbufread;
case Qwait:
if(!canqlock(&p->qwaitr))
@ -959,98 +947,57 @@ procread(Chan *c, void *va, long n, vlong off)
qunlock(&p->qwaitr);
poperror();
n = snprint(a, n, "%d %lud %lud %lud %q",
j = snprint(statbuf, sizeof(statbuf), "%d %lud %lud %lud %q",
wq->w.pid,
wq->w.time[TUser], wq->w.time[TSys], wq->w.time[TReal],
wq->w.msg);
free(wq);
return n;
if(j < n)
n = j;
offset = 0;
goto statbufread;
case Qns:
case Qfd:
eqlock(&p->debug);
if(waserror()){
qunlock(&p->debug);
nexterror();
}
if(p->pgrp == nil || p->dot == nil || p->pid != PID(c->qid))
error(Eprocdied);
mw = c->aux;
if(mw->cddone){
qunlock(&p->debug);
poperror();
return 0;
}
mntscan(mw, p);
if(mw->mh == nil){
mw->cddone = 1;
i = snprint(a, n, "cd %s\n", p->dot->path->s);
qunlock(&p->debug);
poperror();
return i;
}
int2flag(mw->cm->mflag, flag);
if(strcmp(mw->cm->to->path->s, "#M") == 0){
srv = srvname(mw->cm->to->mchan);
i = snprint(a, n, "mount %s %s %s %s\n", flag,
srv==nil? mw->cm->to->mchan->path->s : srv,
mw->mh->from->path->s, mw->cm->spec? mw->cm->spec : "");
free(srv);
}else
i = snprint(a, n, "bind %s %s %s\n", flag,
mw->cm->to->path->s, mw->mh->from->path->s);
if(offset == 0 || offset != c->mrock)
c->nrock = c->mrock = 0;
do {
if(QID(c->qid) == Qns)
j = readns1(c, p, statbuf, sizeof(statbuf));
else
j = readfd1(c, p, statbuf, sizeof(statbuf));
if(j == 0)
break;
c->mrock += j;
} while(c->mrock <= offset);
i = c->mrock - offset;
qunlock(&p->debug);
poperror();
return i;
if(i <= 0)
return 0;
if(i < n)
n = i;
offset = j - i;
goto statbufread;
case Qnoteid:
return readnum(offset, va, n, p->noteid, NUMSIZE);
case Qppid:
return readnum(offset, va, n, p->parentpid, NUMSIZE);
case Qfd:
return procfds(p, va, n, offset);
}
error(Egreg);
return 0; /* not reached */
}
void
mntscan(Mntwalk *mw, Proc *p)
{
Pgrp *pg;
Mount *t;
Mhead *f;
int nxt, i;
ulong last, bestmid;
pg = p->pgrp;
rlock(&pg->ns);
nxt = 0;
bestmid = ~0;
last = 0;
if(mw->mh != nil)
last = mw->cm->mountid;
for(i = 0; i < MNTHASH; i++) {
for(f = pg->mnthash[i]; f != nil; f = f->hash) {
for(t = f->mount; t != nil; t = t->next) {
if(mw->mh == nil ||
(t->mountid > last && t->mountid < bestmid)) {
mw->cm = t;
mw->mh = f;
bestmid = mw->cm->mountid;
nxt = 1;
}
}
}
}
if(nxt == 0)
mw->mh = nil;
runlock(&pg->ns);
}
static long
procwrite(Chan *c, void *va, long n, vlong off)
{

View file

@ -17,7 +17,6 @@ typedef struct Logflag Logflag;
typedef struct Mntcache Mntcache;
typedef struct Mount Mount;
typedef struct Mntrpc Mntrpc;
typedef struct Mntwalk Mntwalk;
typedef struct Mnt Mnt;
typedef struct Mhead Mhead;
typedef struct Note Note;
@ -242,13 +241,6 @@ enum
NSCACHE = (1<<NSLOG),
};
struct Mntwalk /* state for /proc/#/ns */
{
int cddone;
Mhead* mh;
Mount* cm;
};
struct Mount
{
ulong mountid;

View file

@ -224,7 +224,7 @@ void printinit(void);
ulong procalarm(ulong);
void procctl(void);
void procdump(void);
int procfdprint(Chan*, int, int, char*, int);
int procfdprint(Chan*, int, char*, int);
void procflushseg(Segment*);
int procindex(ulong);
void procinit0(void);