plan9fox/sys/src/cmd/git/pack.c

1763 lines
34 KiB
C
Raw Normal View History

#include <u.h>
#include <libc.h>
#include <ctype.h>
#include "git.h"
typedef struct Buf Buf;
typedef struct Metavec Metavec;
typedef struct Meta Meta;
typedef struct Compout Compout;
typedef struct Packf Packf;
#define max(x, y) ((x) > (y) ? (x) : (y))
struct Metavec {
Meta **meta;
int nmeta;
int metasz;
};
struct Meta {
Object *obj;
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
vlong path;
vlong mtime;
/* The best delta we picked */
Meta *head;
Meta *prev;
Delta *delta;
int ndelta;
int nchain;
/* Only used for delta window */
Dtab dtab;
/* Only used for writing offset deltas */
vlong off;
};
struct Compout {
Biobuf *bfd;
DigestState *st;
};
struct Buf {
int len;
int sz;
int off;
char *data;
};
struct Packf {
char path[128];
char *idx;
vlong nidx;
int refs;
Biobuf *pack;
vlong opentm;
};
static int readpacked(Biobuf *, Object *, int);
static Object *readidxobject(Biobuf *, Hash, int);
Objset objcache;
Object *lruhead;
Object *lrutail;
vlong ncache;
vlong cachemax = 512*MiB;
Packf *packf;
int npackf;
int openpacks;
int gitdirmode = -1;
static void
clear(Object *o)
{
if(!o)
return;
assert(o->refs == 0);
assert((o->flag & Ccache) == 0);
assert(o->flag & Cloaded);
switch(o->type){
case GCommit:
if(!o->commit)
break;
free(o->commit->parent);
free(o->commit->author);
free(o->commit->committer);
free(o->commit);
o->commit = nil;
break;
case GTree:
if(!o->tree)
break;
free(o->tree->ent);
free(o->tree);
o->tree = nil;
break;
default:
break;
}
free(o->all);
o->all = nil;
o->data = nil;
o->flag &= ~(Cloaded|Cparsed);
}
void
unref(Object *o)
{
if(!o)
return;
o->refs--;
if(o->refs == 0)
clear(o);
}
Object*
ref(Object *o)
{
o->refs++;
return o;
}
void
cache(Object *o)
{
Object *p;
if(o == lruhead)
return;
if(o == lrutail)
lrutail = lrutail->prev;
if(!(o->flag & Cexist)){
osadd(&objcache, o);
o->id = objcache.nobj;
o->flag |= Cexist;
}
if(o->prev != nil)
o->prev->next = o->next;
if(o->next != nil)
o->next->prev = o->prev;
if(lrutail == o){
lrutail = o->prev;
if(lrutail != nil)
lrutail->next = nil;
}else if(lrutail == nil)
lrutail = o;
if(lruhead)
lruhead->prev = o;
o->next = lruhead;
o->prev = nil;
lruhead = o;
if(!(o->flag & Ccache)){
o->flag |= Ccache;
ref(o);
ncache += o->size;
}
while(ncache > cachemax && lrutail != nil){
p = lrutail;
lrutail = p->prev;
if(lrutail != nil)
lrutail->next = nil;
p->flag &= ~Ccache;
p->prev = nil;
p->next = nil;
ncache -= p->size;
unref(p);
}
}
static int
loadpack(Packf *pf, char *name)
{
char buf[128];
int i, ifd;
Dir *d;
memset(pf, 0, sizeof(Packf));
snprint(buf, sizeof(buf), ".git/objects/pack/%s.idx", name);
snprint(pf->path, sizeof(pf->path), ".git/objects/pack/%s.pack", name);
/*
* if we already have the pack open, just
* steal the loaded info
*/
for(i = 0; i < npackf; i++){
if(strcmp(pf->path, packf[i].path) == 0){
pf->pack = packf[i].pack;
pf->idx = packf[i].idx;
pf->nidx = packf[i].nidx;
packf[i].idx = nil;
packf[i].pack = nil;
return 0;
}
}
if((ifd = open(buf, OREAD)) == -1)
return -1;
if((d = dirfstat(ifd)) == nil){
close(ifd);
return -1;
}
pf->nidx = d->length;
pf->idx = emalloc(pf->nidx);
if(readn(ifd, pf->idx, pf->nidx) != pf->nidx){
close(ifd);
free(pf->idx);
free(d);
return -1;
}
close(ifd);
free(d);
return 0;
}
static void
refreshpacks(void)
{
Packf *pf, *new;
int i, n, l, nnew;
Dir *d;
if((n = slurpdir(".git/objects/pack", &d)) == -1)
return;
nnew = 0;
new = eamalloc(n, sizeof(Packf));
for(i = 0; i < n; i++){
l = strlen(d[i].name);
if(l > 4 && strcmp(d[i].name + l - 4, ".idx") != 0)
continue;
d[i].name[l - 4] = 0;
if(loadpack(&new[nnew], d[i].name) != -1)
nnew++;
}
for(i = 0; i < npackf; i++){
pf = &packf[i];
free(pf->idx);
if(pf->pack != nil)
Bterm(pf->pack);
}
free(packf);
packf = new;
npackf = nnew;
free(d);
}
static Biobuf*
openpack(Packf *pf)
{
vlong t;
int i, best;
if(pf->pack != nil){
pf->refs++;
return pf->pack;
}
/*
* If we've got more packs open
* than we want cached, try to
* free up the oldest ones.
*
* If we can't find a slot, this
* isn't fatal; we can just use
* another fd.
*/
while(openpacks >= Npackcache){
t = (1ull<<62)-1;
best = -1;
for(i = 0; i < npackf; i++){
if(&packf[i] != pf
&& packf[i].pack != nil
&& packf[i].opentm < t
&& packf[i].refs == 0){
t = packf[i].opentm;
best = i;
}
}
if(best == -1){
fprint(2, "no available pack slots\n");
break;
}
Bterm(packf[best].pack);
packf[best].pack = nil;
openpacks--;
}
openpacks++;
pf->opentm = nsec();
pf->refs++;
if((pf->pack = Bopen(pf->path, OREAD)) == nil)
return nil;
return pf->pack;
}
static void
closepack(Packf *pf)
{
pf->refs--;
}
static u32int
crc32(u32int crc, char *b, int nb)
{
static u32int crctab[256] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535,
0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d,
0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac,
0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab,
0xb6662d3d, 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb,
0x086d3d2d, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea,
0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x4db26158, 0x3ab551ce,
0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409,
0xce61e49f, 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739,
0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, 0x8708a3d2, 0x1e01f268,
0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0,
0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8,
0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7,
0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae,
0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 0x88085ae6,
0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d,
0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
0x47b2cf7f, 0x30b5ffe9, 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
int i;
crc ^= 0xFFFFFFFF;
for(i = 0; i < nb; i++)
crc = (crc >> 8) ^ crctab[(crc ^ b[i]) & 0xFF];
return crc ^ 0xFFFFFFFF;
}
int
bappend(void *p, void *src, int len)
{
Buf *b = p;
char *n;
while(b->len + len >= b->sz){
b->sz = b->sz*2 + 64;
n = realloc(b->data, b->sz);
if(n == nil)
return -1;
b->data = n;
}
memmove(b->data + b->len, src, len);
b->len += len;
return len;
}
int
breadc(void *p)
{
return Bgetc(p);
}
int
bdecompress(Buf *d, Biobuf *b, vlong *csz)
{
vlong o;
o = Boffset(b);
if(inflatezlib(d, bappend, b, breadc) == -1 || d->data == nil){
free(d->data);
return -1;
}
if (csz)
*csz = Boffset(b) - o;
return d->len;
}
int
decompress(void **p, Biobuf *b, vlong *csz)
{
Buf d = {.len=0, .data=nil, .sz=0};
if(bdecompress(&d, b, csz) == -1){
free(d.data);
return -1;
}
*p = d.data;
return d.len;
}
static vlong
readvint(char *p, char **pp)
{
int s, c;
vlong n;
s = 0;
n = 0;
do {
c = *p++;
n |= (c & 0x7f) << s;
s += 7;
} while (c & 0x80 && s < 63);
*pp = p;
return n;
}
static int
applydelta(Object *dst, Object *base, char *d, int nd)
{
char *r, *b, *ed, *er;
int n, nr, c;
vlong o, l;
ed = d + nd;
b = base->data;
n = readvint(d, &d);
if(n != base->size){
werrstr("mismatched source size");
return -1;
}
nr = readvint(d, &d);
r = emalloc(nr + 64);
n = snprint(r, 64, "%T %d", base->type, nr) + 1;
dst->all = r;
dst->type = base->type;
dst->data = r + n;
dst->size = nr;
er = dst->data + nr;
r = dst->data;
while(d != ed){
c = *d++;
/* copy from base */
if(c & 0x80){
o = 0;
l = 0;
/* Offset in base */
if(d != ed && (c & 0x01)) o |= (*d++ << 0) & 0x000000ff;
if(d != ed && (c & 0x02)) o |= (*d++ << 8) & 0x0000ff00;
if(d != ed && (c & 0x04)) o |= (*d++ << 16) & 0x00ff0000;
if(d != ed && (c & 0x08)) o |= (*d++ << 24) & 0xff000000;
/* Length to copy */
if(d != ed && (c & 0x10)) l |= (*d++ << 0) & 0x0000ff;
if(d != ed && (c & 0x20)) l |= (*d++ << 8) & 0x00ff00;
if(d != ed && (c & 0x40)) l |= (*d++ << 16) & 0xff0000;
if(l == 0) l = 0x10000;
if(o + l > base->size){
werrstr("garbled delta: out of bounds copy");
return -1;
}
memmove(r, b + o, l);
r += l;
/* inline data */
}else{
if(c > ed - d){
werrstr("garbled delta: write past object");
return -1;
}
memmove(r, d, c);
d += c;
r += c;
}
}
if(r != er){
werrstr("truncated delta");
return -1;
}
return nr;
}
static int
readrdelta(Biobuf *f, Object *o, int nd, int flag)
{
Object *b;
Hash h;
char *d;
int n;
d = nil;
if(Bread(f, h.h, sizeof(h.h)) != sizeof(h.h))
goto error;
if(hasheq(&o->hash, &h))
goto error;
if((n = decompress(&d, f, nil)) == -1)
goto error;
o->len = Boffset(f) - o->off;
if(d == nil || n != nd)
goto error;
if((b = readidxobject(f, h, flag|Cthin)) == nil)
goto error;
if(applydelta(o, b, d, n) == -1)
goto error;
free(d);
return 0;
error:
free(d);
return -1;
}
static int
readodelta(Biobuf *f, Object *o, vlong nd, vlong p, int flag)
{
Object b;
char *d;
vlong r;
int c, n;
d = nil;
if((c = Bgetc(f)) == -1)
return -1;
r = c & 0x7f;
while(c & 0x80 && r < (1ULL<<56)){
if((c = Bgetc(f)) == -1)
return -1;
r = ((r + 1)<<7) | (c & 0x7f);
}
if(r > p || r >= (1ULL<<56)){
werrstr("junk offset -%lld (from %lld)", r, p);
goto error;
}
if((n = decompress(&d, f, nil)) == -1)
goto error;
o->len = Boffset(f) - o->off;
if(d == nil || n != nd)
goto error;
if(Bseek(f, p - r, 0) == -1)
goto error;
memset(&b, 0, sizeof(Object));
if(readpacked(f, &b, flag) == -1)
goto error;
if(applydelta(o, &b, d, nd) == -1)
goto error;
clear(&b);
free(d);
return 0;
error:
free(d);
return -1;
}
static int
readpacked(Biobuf *f, Object *o, int flag)
{
int c, s, n;
vlong l, p;
int t;
Buf b;
p = Boffset(f);
c = Bgetc(f);
if(c == -1)
return -1;
l = c & 0xf;
s = 4;
t = (c >> 4) & 0x7;
if(!t){
werrstr("unknown type for byte %x at %lld", c, p);
return -1;
}
while(c & 0x80){
if((c = Bgetc(f)) == -1)
return -1;
l |= (c & 0x7f) << s;
s += 7;
}
if(l >= (1ULL << 32)){
werrstr("object too big");
return -1;
}
switch(t){
default:
werrstr("invalid object at %lld", Boffset(f));
return -1;
case GCommit:
case GTree:
case GTag:
case GBlob:
b.sz = 64 + l;
b.data = emalloc(b.sz);
n = snprint(b.data, 64, "%T %lld", t, l) + 1;
b.len = n;
if(bdecompress(&b, f, nil) == -1){
free(b.data);
return -1;
}
o->len = Boffset(f) - o->off;
o->type = t;
o->all = b.data;
o->data = b.data + n;
o->size = b.len - n;
break;
case GOdelta:
if(readodelta(f, o, l, p, flag) == -1)
return -1;
break;
case GRdelta:
if(readrdelta(f, o, l, flag) == -1)
return -1;
break;
}
o->flag |= Cloaded|flag;
return 0;
}
static int
readloose(Biobuf *f, Object *o, int flag)
{
struct { char *tag; int type; } *p, types[] = {
{"blob", GBlob},
{"tree", GTree},
{"commit", GCommit},
{"tag", GTag},
{nil},
};
char *d, *s, *e;
vlong sz, n;
int l;
n = decompress(&d, f, nil);
if(n == -1)
return -1;
s = d;
o->type = GNone;
for(p = types; p->tag; p++){
l = strlen(p->tag);
if(strncmp(s, p->tag, l) == 0){
s += l;
o->type = p->type;
while(!isspace(*s))
s++;
break;
}
}
if(o->type == GNone){
free(o->data);
return -1;
}
sz = strtol(s, &e, 0);
if(e == s || *e++ != 0){
werrstr("malformed object header");
goto error;
}
if(sz != n - (e - d)){
werrstr("mismatched sizes");
goto error;
}
o->size = sz;
o->data = e;
o->all = d;
o->flag |= Cloaded|flag;
return 0;
error:
free(d);
return -1;
}
vlong
searchindex(char *idx, int nidx, Hash h)
{
int lo, hi, hidx, i, r, nent;
vlong o, oo;
void *s;
o = 8;
if(nidx < 8 + 256*4)
return -1;
/*
* Read the fanout table. The fanout table
* contains 256 entries, corresponsding to
* the first byte of the hash. Each entry
* is a 4 byte big endian integer, containing
* the total number of entries with a leading
* byte <= the table index, allowing us to
* rapidly do a binary search on them.
*/
if (h.h[0] == 0){
lo = 0;
hi = GETBE32(idx + o);
} else {
o += h.h[0]*4 - 4;
lo = GETBE32(idx + o);
hi = GETBE32(idx + o + 4);
}
if(hi == lo)
goto notfound;
nent=GETBE32(idx + 8 + 255*4);
/*
* Now that we know the range of hashes that the
* entry may exist in, search them
*/
r = -1;
hidx = -1;
o = 8 + 256*4;
while(lo < hi){
hidx = (hi + lo)/2;
s = idx + o + hidx*sizeof(h.h);
r = memcmp(h.h, s, sizeof(h.h));
if(r < 0)
hi = hidx;
else if(r > 0)
lo = hidx + 1;
else
break;
}
if(r != 0)
goto notfound;
/*
* We found the entry. If it's 32 bits, then we
* can just return the oset, otherwise the 32
* bit entry contains the oset to the 64 bit
* entry.
*/
oo = 8; /* Header */
oo += 256*4; /* Fanout table */
oo += Hashsz*nent; /* Hashes */
oo += 4*nent; /* Checksums */
oo += 4*hidx; /* Offset offset */
if(oo < 0 || oo + 4 > nidx)
goto err;
i = GETBE32(idx + oo);
o = i & 0xffffffffULL;
/*
* Large offsets (i.e. 64-bit) are encoded as an index
* into the next table with the MSB bit set.
*/
if(o & (1ull << 31)){
o &= 0x7fffffffULL;
oo = 8; /* Header */
oo += 256*4; /* Fanout table */
oo += Hashsz*nent; /* Hashes */
oo += 4*nent; /* Checksums */
oo += 4*nent; /* 32-bit Offsets */
oo += 8*o; /* 64-bit Offset offset */
if(oo < 0 || oo + 8 >= nidx)
goto err;
o = GETBE64(idx + oo);
}
return o;
err:
werrstr("out of bounds read");
return -1;
notfound:
werrstr("not present");
return -1;
}
/*
* Scans for non-empty word, copying it into buf.
* Strips off word, leading, and trailing space
* from input.
*
* Returns -1 on empty string or error, leaving
* input unmodified.
*/
static int
scanword(char **str, int *nstr, char *buf, int nbuf)
{
char *p;
int n, r;
r = -1;
p = *str;
n = *nstr;
while(n && isblank(*p)){
n--;
p++;
}
for(; n && *p && !isspace(*p); p++, n--){
r = 0;
*buf++ = *p;
nbuf--;
if(nbuf == 0)
return -1;
}
while(n && isblank(*p)){
n--;
p++;
}
*buf = 0;
*str = p;
*nstr = n;
return r;
}
static void
nextline(char **str, int *nstr)
{
char *s;
if((s = strchr(*str, '\n')) != nil){
*nstr -= s - *str + 1;
*str = s + 1;
}
}
static int
parseauthor(char **str, int *nstr, char **name, vlong *time)
{
char buf[128];
Resub m[4];
char *p;
int n, nm;
if((p = strchr(*str, '\n')) == nil)
sysfatal("malformed author line");
n = p - *str;
if(n >= sizeof(buf))
sysfatal("overlong author line");
memset(m, 0, sizeof(m));
snprint(buf, n + 1, *str);
*str = p;
*nstr -= n;
if(!regexec(authorpat, buf, m, nelem(m)))
sysfatal("invalid author line %s", buf);
nm = m[1].ep - m[1].sp;
*name = emalloc(nm + 1);
memcpy(*name, m[1].sp, nm);
buf[nm] = 0;
nm = m[2].ep - m[2].sp;
memcpy(buf, m[2].sp, nm);
buf[nm] = 0;
*time = atoll(buf);
return 0;
}
static void
parsecommit(Object *o)
{
char *p, *t, buf[128];
int np;
p = o->data;
np = o->size;
o->commit = emalloc(sizeof(Cinfo));
while(1){
if(scanword(&p, &np, buf, sizeof(buf)) == -1)
break;
if(strcmp(buf, "tree") == 0){
if(scanword(&p, &np, buf, sizeof(buf)) == -1)
sysfatal("invalid commit: tree missing");
if(hparse(&o->commit->tree, buf) == -1)
sysfatal("invalid commit: garbled tree");
}else if(strcmp(buf, "parent") == 0){
if(scanword(&p, &np, buf, sizeof(buf)) == -1)
sysfatal("invalid commit: missing parent");
o->commit->parent = realloc(o->commit->parent, ++o->commit->nparent * sizeof(Hash));
if(!o->commit->parent)
sysfatal("unable to malloc: %r");
if(hparse(&o->commit->parent[o->commit->nparent - 1], buf) == -1)
sysfatal("invalid commit: garbled parent");
}else if(strcmp(buf, "author") == 0){
parseauthor(&p, &np, &o->commit->author, &o->commit->mtime);
}else if(strcmp(buf, "committer") == 0){
parseauthor(&p, &np, &o->commit->committer, &o->commit->ctime);
}else if(strcmp(buf, "gpgsig") == 0){
/* just drop it */
if((t = strstr(p, "-----END PGP SIGNATURE-----")) == nil)
if((t = strstr(p, "-----END SSH SIGNATURE-----")) == nil)
sysfatal("malformed gpg signature");
np -= t - p;
p = t;
}
nextline(&p, &np);
}
while (np && isspace(*p)) {
p++;
np--;
}
o->commit->msg = p;
o->commit->nmsg = np;
}
static void
parsetree(Object *o)
{
int m, a, entsz, nent;
Dirent *t, *ent;
char *p, *ep;
p = o->data;
ep = p + o->size;
nent = 0;
entsz = 16;
ent = eamalloc(entsz, sizeof(Dirent));
o->tree = emalloc(sizeof(Tinfo));
while(p != ep){
if(nent == entsz){
entsz *= 2;
ent = earealloc(ent, entsz, sizeof(Dirent));
}
t = &ent[nent++];
m = strtol(p, &p, 8);
if(*p != ' ')
sysfatal("malformed tree %H: *p=(%d) %c\n", o->hash, *p, *p);
p++;
/*
* only the stored permissions for the user
* are relevant; git fills group and world
* bits with whatever -- so to serve with
* useful permissions, replicate the mode
* of the git repo dir.
*/
a = (m & 0777)>>6;
t->mode = ((a<<6)|(a<<3)|a) & gitdirmode;
t->ismod = 0;
t->islink = 0;
if(m == 0160000){
t->mode |= DMDIR;
t->ismod = 1;
}else if(m == 0120000){
t->mode = 0;
t->islink = 1;
}
if(m & 0040000)
t->mode |= DMDIR;
t->name = p;
p = memchr(p, 0, ep - p);
if(*p++ != 0 || ep - p < sizeof(t->h.h))
sysfatal("malformed tree %H, remaining %d (%s)", o->hash, (int)(ep - p), p);
memcpy(t->h.h, p, sizeof(t->h.h));
p += sizeof(t->h.h);
}
o->tree->ent = ent;
o->tree->nent = nent;
}
static void
parsetag(Object *)
{
}
void
parseobject(Object *o)
{
if(o->flag & Cparsed)
return;
switch(o->type){
case GTree: parsetree(o); break;
case GCommit: parsecommit(o); break;
case GTag: parsetag(o); break;
default: break;
}
o->flag |= Cparsed;
}
static Object*
readidxobject(Biobuf *idx, Hash h, int flag)
{
char path[Pathmax], hbuf[41];
Object *obj, *new;
int i, r, retried;
Biobuf *f;
vlong o;
if((obj = osfind(&objcache, h)) != nil){
if(flag & Cidx){
/*
* If we're indexing, we need to be careful
* to only return objects within this pack,
* so if the objects exist outside the pack,
* we don't index the wrong copy.
*/
if(!(obj->flag & Cidx))
return nil;
if(obj->flag & Cloaded)
return obj;
o = Boffset(idx);
if(Bseek(idx, obj->off, 0) == -1)
return nil;
if(readpacked(idx, obj, flag) == -1)
return nil;
if(Bseek(idx, o, 0) == -1)
sysfatal("could not restore offset");
cache(obj);
return obj;
}
if(obj->flag & Cloaded)
return obj;
}
if(flag & Cthin)
flag &= ~Cidx;
if(flag & Cidx)
return nil;
new = nil;
if(obj == nil){
new = emalloc(sizeof(Object));
new->id = objcache.nobj + 1;
new->hash = h;
obj = new;
}
o = -1;
retried = 0;
retry:
for(i = 0; i < npackf; i++){
o = searchindex(packf[i].idx, packf[i].nidx, h);
if(o != -1){
if((f = openpack(&packf[i])) == nil)
goto error;
if((r = Bseek(f, o, 0)) != -1)
r = readpacked(f, obj, flag);
closepack(&packf[i]);
if(r == -1)
goto error;
parseobject(obj);
cache(obj);
return obj;
}
}
snprint(hbuf, sizeof(hbuf), "%H", h);
snprint(path, sizeof(path), ".git/objects/%c%c/%s", hbuf[0], hbuf[1], hbuf + 2);
if((f = Bopen(path, OREAD)) != nil){
if(readloose(f, obj, flag) == -1)
goto errorf;
Bterm(f);
parseobject(obj);
cache(obj);
return obj;
}
if(o == -1){
if(retried)
goto error;
retried = 1;
refreshpacks();
goto retry;
}
errorf:
Bterm(f);
error:
free(new);
return nil;
}
/*
* Loads and returns a cached object.
*/
Object*
readobject(Hash h)
{
Object *o;
Dir *d;
if(gitdirmode == -1){
if((d = dirstat(".git")) == nil)
sysfatal("stat .git: %r");
gitdirmode = d->mode & 0777;
free(d);
}
if((o = readidxobject(nil, h, 0)) == nil)
return nil;
parseobject(o);
ref(o);
return o;
}
/*
* Creates and returns a cached, cleared object
* that will get loaded some other time. Useful
* for performance if need to mark that a blob
* exists, but we don't care about its contents.
*
* The refcount of the returned object is 0, so
* it doesn't need to be unrefed.
*/
Object*
clearedobject(Hash h, int type)
{
Object *o;
if((o = osfind(&objcache, h)) != nil)
return o;
o = emalloc(sizeof(Object));
o->hash = h;
o->type = type;
osadd(&objcache, o);
o->id = objcache.nobj;
o->flag |= Cexist;
return o;
}
int
objcmp(void *pa, void *pb)
{
Object *a, *b;
a = *(Object**)pa;
b = *(Object**)pb;
return memcmp(a->hash.h, b->hash.h, sizeof(a->hash.h));
}
static int
hwrite(Biobuf *b, void *buf, int len, DigestState **st)
{
*st = sha1(buf, len, nil, *st);
return Bwrite(b, buf, len);
}
static u32int
objectcrc(Biobuf *f, Object *o)
{
char buf[8096];
int n, r;
o->crc = 0;
Bseek(f, o->off, 0);
for(n = o->len; n > 0; n -= r){
r = Bread(f, buf, n > sizeof(buf) ? sizeof(buf) : n);
if(r == -1)
return -1;
if(r == 0)
return 0;
o->crc = crc32(o->crc, buf, r);
}
return 0;
}
int
indexpack(char *pack, char *idx, Hash ph)
{
char hdr[4*3], buf[8];
int nobj, npct, nvalid, nbig;
int i, n, pct;
Object *o, **obj;
DigestState *st;
char *valid;
Biobuf *f;
Hash h;
int c;
if((f = Bopen(pack, OREAD)) == nil)
return -1;
if(Bread(f, hdr, sizeof(hdr)) != sizeof(hdr)){
werrstr("short read on header");
return -1;
}
if(memcmp(hdr, "PACK\0\0\0\2", 8) != 0){
werrstr("invalid header");
return -1;
}
pct = 0;
npct = 0;
nvalid = 0;
nobj = GETBE32(hdr + 8);
obj = eamalloc(nobj, sizeof(Object*));
valid = eamalloc(nobj, sizeof(char));
if(interactive)
fprint(2, "indexing %d objects: 0%%", nobj);
while(nvalid != nobj){
n = 0;
for(i = 0; i < nobj; i++){
if(valid[i]){
n++;
continue;
}
pct = showprogress((npct*100)/nobj, pct);
if(obj[i] == nil){
o = emalloc(sizeof(Object));
o->off = Boffset(f);
obj[i] = o;
}
o = obj[i];
/*
* We can seek around when packing delta chains.
* Be extra careful while we don't know where all
* the objects start.
*/
Bseek(f, o->off, 0);
if(readpacked(f, o, Cidx) == -1)
continue;
sha1((uchar*)o->all, o->size + strlen(o->all) + 1, o->hash.h, nil);
valid[i] = 1;
cache(o);
npct++;
n++;
if(objectcrc(f, o) == -1)
return -1;
}
if(n == nvalid){
sysfatal("fix point reached too early: %d/%d: %r", nvalid, nobj);
goto error;
}
nvalid = n;
}
if(interactive)
fprint(2, "\b\b\b\b100%%\n");
Bterm(f);
st = nil;
qsort(obj, nobj, sizeof(Object*), objcmp);
if((f = Bopen(idx, OWRITE)) == nil)
return -1;
if(hwrite(f, "\xfftOc\x00\x00\x00\x02", 8, &st) != 8)
goto error;
/* fanout table */
c = 0;
for(i = 0; i < 256; i++){
while(c < nobj && (obj[c]->hash.h[0] & 0xff) <= i)
c++;
PUTBE32(buf, c);
if(hwrite(f, buf, 4, &st) == -1)
goto error;
}
for(i = 0; i < nobj; i++){
o = obj[i];
if(hwrite(f, o->hash.h, sizeof(o->hash.h), &st) == -1)
goto error;
}
for(i = 0; i < nobj; i++){
PUTBE32(buf, obj[i]->crc);
if(hwrite(f, buf, 4, &st) == -1)
goto error;
}
nbig = 0;
for(i = 0; i < nobj; i++){
if(obj[i]->off < (1ull<<31))
PUTBE32(buf, obj[i]->off);
else{
PUTBE32(buf, (1ull << 31) | nbig);
nbig++;
}
if(hwrite(f, buf, 4, &st) == -1)
goto error;
}
for(i = 0; i < nobj; i++){
if(obj[i]->off >= (1ull<<31)){
PUTBE64(buf, obj[i]->off);
if(hwrite(f, buf, 8, &st) == -1)
goto error;
}
}
if(hwrite(f, ph.h, sizeof(ph.h), &st) == -1)
goto error;
sha1(nil, 0, h.h, st);
Bwrite(f, h.h, sizeof(h.h));
free(obj);
free(valid);
Bterm(f);
return 0;
error:
free(obj);
free(valid);
Bterm(f);
return -1;
}
static int
deltaordercmp(void *pa, void *pb)
{
Meta *a, *b;
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
vlong cmp;
a = *(Meta**)pa;
b = *(Meta**)pb;
if(a->obj->type != b->obj->type)
return a->obj->type - b->obj->type;
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
cmp = (b->path - a->path);
if(cmp != 0)
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
return (cmp < 0) ? -1 : 1;
cmp = a->mtime - b->mtime;
if(cmp != 0)
return (cmp < 0) ? -1 : 1;
return memcmp(a->obj->hash.h, b->obj->hash.h, sizeof(a->obj->hash.h));
}
static int
writeordercmp(void *pa, void *pb)
{
Meta *a, *b, *ahd, *bhd;
a = *(Meta**)pa;
b = *(Meta**)pb;
ahd = (a->head == nil) ? a : a->head;
bhd = (b->head == nil) ? b : b->head;
if(ahd->mtime != bhd->mtime)
return bhd->mtime - ahd->mtime;
if(ahd != bhd)
return (uintptr)bhd - (uintptr)ahd;
if(a->nchain != b->nchain)
return a->nchain - b->nchain;
return a->mtime - b->mtime;
}
static void
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
addmeta(Metavec *v, Objset *has, Object *o, vlong pathid, vlong mtime)
{
Meta *m;
if(oshas(has, o->hash))
return;
osadd(has, o);
if(v == nil)
return;
m = emalloc(sizeof(Meta));
m->obj = o;
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
m->path = pathid;
m->mtime = mtime;
if(v->nmeta == v->metasz){
v->metasz = 2*v->metasz;
v->meta = earealloc(v->meta, v->metasz, sizeof(Meta*));
}
v->meta[v->nmeta++] = m;
}
static void
freemeta(Meta *m)
{
free(m->delta);
free(m);
}
static int
loadtree(Metavec *v, Objset *has, Hash tree, char *dpath, vlong mtime)
{
Object *t, *o;
Dirent *e;
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
vlong dh, eh;
int i, k, r;
char *p;
if(oshas(has, tree))
return 0;
if((t = readobject(tree)) == nil)
return -1;
if(t->type != GTree){
fprint(2, "load: %H: not tree\n", t->hash);
unref(t);
return 0;
}
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
dh = murmurhash2(dpath, strlen(dpath));
addmeta(v, has, t, dh, mtime);
for(i = 0; i < t->tree->nent; i++){
e = &t->tree->ent[i];
if(oshas(has, e->h))
continue;
if(e->ismod)
continue;
k = (e->mode & DMDIR) ? GTree : GBlob;
o = clearedobject(e->h, k);
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
if(k == GTree){
p = smprint("%s/%s", dpath, e->name);
r = loadtree(v, has, e->h, p, mtime);
free(p);
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
if(r == -1)
return -1;
}else{
eh = murmurhash2(e->name, strlen(e->name));
addmeta(v, has, o, dh^eh, mtime);
}
}
unref(t);
return 0;
}
static int
loadcommit(Metavec *v, Objset *has, Hash h)
{
Object *c;
int r;
if(osfind(has, h))
return 0;
if((c = readobject(h)) == nil)
return -1;
if(c->type != GCommit){
fprint(2, "load: %H: not commit\n", c->hash);
unref(c);
return 0;
}
git: performance enhancements Inspired by some changes made in game of trees, I've implemented a number of speedups in git9. First, hashing the chunks during deltification with murmurhash instead of sha1 speeds up the delta search significantly. The stretch function was micro-optimized a bit as well, since that was taking a large portion of the time when chunking. Finally, the full path is not stored. We only care about grouping files with the same name and path. We don't care about the ordering. Therefore, only the hash of the path xored with the hash of the diretory is kept, which saves a bunch of mallocs and string munging. This reduces the time spent repacking some test repos significantly. 9front: % time git/repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 58.85u 1.39s 61.82r git/repack % time /sys/src/cmd/git/6.repack deltifying 97473 objects: 100% writing 97473 objects: 100% indexing 97473 objects: 100% 43.86u 1.29s 47.51r /sys/src/cmd/git/6.repack openbsd: % time git/repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1589.48u 45.03s 1729.18r git/repack % time /sys/src/cmd/git/6.repack deltifying 2092325 objects: 100% writing 2092325 objects: 100% indexing 2092325 objects: 100% 1238.68u 41.49s 1373.15r /sys/src/cmd/git/6.repack go: % time git/repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 345.32u 7.71s 369.25r git/repack % time /sys/src/cmd/git/6.repack deltifying 529507 objects: 100% writing 529507 objects: 100% indexing 529507 objects: 100% 248.07u 4.47s 257.59r /sys/src/cmd/git/6.repack
2022-05-28 16:38:07 +00:00
addmeta(v, has, c, 0, c->commit->ctime);
r = loadtree(v, has, c->commit->tree, "", c->commit->ctime);
unref(c);
return r;
}
static int
readmeta(Hash *theirs, int ntheirs, Hash *ours, int nours, Meta ***m)
{
Object **obj;
Objset has;
int i, nobj;
Metavec v;
*m = nil;
osinit(&has);
v.nmeta = 0;
v.metasz = 64;
v.meta = eamalloc(v.metasz, sizeof(Meta*));
if(findtwixt(theirs, ntheirs, ours, nours, &obj, &nobj) == -1)
sysfatal("load twixt: %r");
if(nobj == 0)
return 0;
for(i = 0; i < nours; i++)
if(!hasheq(&ours[i], &Zhash))
if(loadcommit(nil, &has, ours[i]) == -1)
goto out;
for(i = 0; i < nobj; i++)
if(loadcommit(&v, &has, obj[i]->hash) == -1)
goto out;
osclear(&has);
*m = v.meta;
return v.nmeta;
out:
osclear(&has);
free(v.meta);
return -1;
}
static int
deltasz(Delta *d, int nd)
{
int i, sz;
sz = 32;
for(i = 0; i < nd; i++)
sz += d[i].cpy ? 7 : d[i].len + 1;
return sz;
}
static void
pickdeltas(Meta **meta, int nmeta)
{
Meta *m, *p;
Object *o;
Delta *d;
int i, j, nd, sz, pct, best;
pct = 0;
dprint(1, "picking deltas\n");
if(interactive)
fprint(2, "deltifying %d objects: 0%%", nmeta);
qsort(meta, nmeta, sizeof(Meta*), deltaordercmp);
for(i = 0; i < nmeta; i++){
m = meta[i];
pct = showprogress((i*100) / nmeta, pct);
m->delta = nil;
m->ndelta = 0;
if(m->obj->type == GCommit || m->obj->type == GTag)
continue;
if((o = readobject(m->obj->hash)) == nil)
sysfatal("readobject %H: %r", m->obj->hash);
dtinit(&m->dtab, o);
if(i >= 11)
dtclear(&meta[i-11]->dtab);
best = o->size;
for(j = max(0, i - 10); j < i; j++){
p = meta[j];
/* long chains make unpacking slow */
if(p->nchain >= 128 || p->obj->type != o->type)
continue;
d = deltify(o, &p->dtab, &nd);
sz = deltasz(d, nd);
if(sz + 32 < best){
/*
* if we already picked a best delta,
* replace it.
*/
free(m->delta);
best = sz;
m->delta = d;
m->ndelta = nd;
m->nchain = p->nchain + 1;
m->prev = p;
m->head = p->head;
if(m->head == nil)
m->head = p;
}else
free(d);
}
unref(o);
}
for(i = max(0, nmeta - 10); i < nmeta; i++)
dtclear(&meta[i]->dtab);
if(interactive)
fprint(2, "\b\b\b\b100%%\n");
}
static int
compread(void *p, void *dst, int n)
{
Buf *b;
b = p;
if(n > b->sz - b->off)
n = b->sz - b->off;
memcpy(dst, b->data + b->off, n);
b->off += n;
return n;
}
static int
compwrite(void *p, void *buf, int n)
{
return hwrite(((Compout *)p)->bfd, buf, n, &((Compout*)p)->st);
}
static int
hcompress(Biobuf *bfd, void *buf, int sz, DigestState **st)
{
int r;
Buf b ={
.off=0,
.data=buf,
.sz=sz,
};
Compout o = {
.bfd = bfd,
.st = *st,
};
r = deflatezlib(&o, compwrite, &b, compread, 6, 0);
*st = o.st;
return r;
}
static void
append(char **p, int *len, int *sz, void *seg, int nseg)
{
if(*len + nseg >= *sz){
while(*len + nseg >= *sz)
*sz += *sz/2;
*p = erealloc(*p, *sz);
}
memcpy(*p + *len, seg, nseg);
*len += nseg;
}
static int
encodedelta(Meta *m, Object *o, Object *b, void **pp)
{
char *p, *bp, buf[16];
int len, sz, n, i, j;
Delta *d;
sz = 128;
len = 0;
p = emalloc(sz);
/* base object size */
buf[0] = b->size & 0x7f;
n = b->size >> 7;
for(i = 1; n > 0; i++){
buf[i - 1] |= 0x80;
buf[i] = n & 0x7f;
n >>= 7;
}
append(&p, &len, &sz, buf, i);
/* target object size */
buf[0] = o->size & 0x7f;
n = o->size >> 7;
for(i = 1; n > 0; i++){
buf[i - 1] |= 0x80;
buf[i] = n & 0x7f;
n >>= 7;
}
append(&p, &len, &sz, buf, i);
for(j = 0; j < m->ndelta; j++){
d = &m->delta[j];
if(d->cpy){
n = d->off;
bp = buf + 1;
buf[0] = 0x81;
buf[1] = 0x00;
for(i = 0; i < sizeof(buf); i++) {
buf[0] |= 1<<i;
*bp++ = n & 0xff;
n >>= 8;
if(n == 0)
break;
}
n = d->len;
if(n != 0x10000) {
buf[0] |= 0x1<<4;
for(i = 0; i < sizeof(buf)-4 && n > 0; i++){
buf[0] |= 1<<(i + 4);
*bp++ = n & 0xff;
n >>= 8;
}
}
append(&p, &len, &sz, buf, bp - buf);
}else{
n = 0;
while(n != d->len){
buf[0] = (d->len - n < 127) ? d->len - n : 127;
append(&p, &len, &sz, buf, 1);
append(&p, &len, &sz, o->data + d->off + n, buf[0]);
n += buf[0];
}
}
}
*pp = p;
return len;
}
static int
packhdr(char *hdr, int ty, int len)
{
int i;
hdr[0] = ty << 4;
hdr[0] |= len & 0xf;
len >>= 4;
for(i = 1; len != 0; i++){
hdr[i-1] |= 0x80;
hdr[i] = len & 0x7f;
len >>= 7;
}
return i;
}
static int
packoff(char *hdr, vlong off)
{
int i, j;
char rbuf[8];
rbuf[0] = off & 0x7f;
for(i = 1; (off >>= 7) != 0; i++)
rbuf[i] = (--off & 0x7f)|0x80;
j = 0;
while(i > 0)
hdr[j++] = rbuf[--i];
return j;
}
static int
genpack(int fd, Meta **meta, int nmeta, Hash *h, int odelta)
{
int i, nh, nd, res, pct, ret;
DigestState *st;
Biobuf *bfd;
Meta *m;
Object *o, *po, *b;
char *p, buf[32];
st = nil;
ret = -1;
pct = 0;
dprint(1, "generating pack\n");
if((fd = dup(fd, -1)) == -1)
return -1;
if((bfd = Bfdopen(fd, OWRITE)) == nil)
return -1;
if(hwrite(bfd, "PACK", 4, &st) == -1)
return -1;
PUTBE32(buf, 2);
if(hwrite(bfd, buf, 4, &st) == -1)
return -1;
PUTBE32(buf, nmeta);
if(hwrite(bfd, buf, 4, &st) == -1)
return -1;
qsort(meta, nmeta, sizeof(Meta*), writeordercmp);
if(interactive)
fprint(2, "writing %d objects: 0%%", nmeta);
for(i = 0; i < nmeta; i++){
pct = showprogress((i*100)/nmeta, pct);
m = meta[i];
m->off = Boffset(bfd);
if(m->off == -1)
goto error;
if((o = readobject(m->obj->hash)) == nil)
return -1;
if(m->delta == nil){
nh = packhdr(buf, o->type, o->size);
if(hwrite(bfd, buf, nh, &st) == -1)
goto error;
if(hcompress(bfd, o->data, o->size, &st) == -1)
goto error;
}else{
if((b = readobject(m->prev->obj->hash)) == nil)
goto error;
nd = encodedelta(m, o, b, &p);
unref(b);
if(odelta && m->prev->off != 0){
nh = 0;
nh += packhdr(buf, GOdelta, nd);
nh += packoff(buf+nh, m->off - m->prev->off);
if(hwrite(bfd, buf, nh, &st) == -1)
goto error;
}else{
nh = packhdr(buf, GRdelta, nd);
po = m->prev->obj;
if(hwrite(bfd, buf, nh, &st) == -1)
goto error;
if(hwrite(bfd, po->hash.h, sizeof(po->hash.h), &st) == -1)
goto error;
}
res = hcompress(bfd, p, nd, &st);
free(p);
if(res == -1)
goto error;
}
unref(o);
}
if(interactive)
fprint(2, "\b\b\b\b100%%\n");
sha1(nil, 0, h->h, st);
if(Bwrite(bfd, h->h, sizeof(h->h)) == -1)
goto error;
ret = 0;
error:
if(Bterm(bfd) == -1)
return -1;
return ret;
}
int
writepack(int fd, Hash *theirs, int ntheirs, Hash *ours, int nours, Hash *h)
{
Meta **meta;
int i, r, nmeta;
if((nmeta = readmeta(theirs, ntheirs, ours, nours, &meta)) == -1)
return -1;
pickdeltas(meta, nmeta);
r = genpack(fd, meta, nmeta, h, 0);
for(i = 0; i < nmeta; i++)
freemeta(meta[i]);
free(meta);
return r;
}