Bo-Yi Wu
2018-04-16 17:26:15 +08:00
committed by GitHub
parent 069efa9e5e
commit ffa8eb12b3
65 changed files with 13049 additions and 1 deletions

vendor/github.com/dgraph-io/badger/table/README.md generated vendored Normal file

@@ -0,0 +1,51 @@
# BenchmarkRead
```
$ go test -bench Read$ -count 3
Size of table: 105843444
BenchmarkRead-8 3 343846914 ns/op
BenchmarkRead-8 3 351790907 ns/op
BenchmarkRead-8 3 351762823 ns/op
```
The size of the table is 105,843,444 bytes, which is ~101M.
The rate is ~287M/s, which matches our read speed. This is using mmap.
To read a 64M table, this would take ~0.22s, which is negligible.
```
$ go test -bench BenchmarkReadAndBuild -count 3
BenchmarkReadAndBuild-8 1 2341034225 ns/op
BenchmarkReadAndBuild-8 1 2346349671 ns/op
BenchmarkReadAndBuild-8 1 2364064576 ns/op
```
The rate is ~43M/s. To build a ~64M table, this would take ~1.5s. Note that this
does NOT include flushing the table to disk. All we are doing above is reading
one table (mmapped) and building one table in memory.
The table building takes 1.5 - 0.22 ~ 1.3s.
If we are writing out up to 10 tables, this would take 1.5*10 ~ 15s, of which
~13s is spent building the tables.
When running populate, building one table in memory tends to take ~1.5s to ~2.5s
on my system. Where does this overhead come from? Let's investigate the merging.
Below, we merge 5 tables. The total size remains unchanged at ~101M.
```
$ go test -bench ReadMerged -count 3
BenchmarkReadMerged-8 1 1321190264 ns/op
BenchmarkReadMerged-8 1 1296958737 ns/op
BenchmarkReadMerged-8 1 1314381178 ns/op
```
The rate is ~76M/s. Reading the merged input for a 64M table would take ~0.84s,
and building the table takes ~1.3s as we saw above. So in total, we expect
around 0.84 + 1.3 ~ 2.1s. This roughly matches what we observe when running
populate. There might be some additional overhead from the concurrent writes
flushing tables to disk. Also, the tables tend to be slightly bigger than 64M.
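
For reference, a table-read benchmark of this shape can be written with Go's
testing package. The sketch below is illustrative only (the real benchmarks
live in this package's test files); `getTable` is a hypothetical helper that
builds or opens a ~100M table:
```
func BenchmarkRead(b *testing.B) {
	tbl := getTable(b) // hypothetical helper
	defer tbl.DecrRef()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		it := tbl.NewIterator(false)
		for it.Rewind(); it.Valid(); it.Next() {
			// Touch key and value so the read isn't optimized away.
			_, _ = it.Key(), it.Value()
		}
		it.Close()
	}
}
```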

vendor/github.com/dgraph-io/badger/table/builder.go generated vendored Normal file

@@ -0,0 +1,235 @@
/*
* Copyright 2017 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package table
import (
"bytes"
"encoding/binary"
"io"
"math"
"github.com/AndreasBriese/bbloom"
"github.com/dgraph-io/badger/y"
)
var (
restartInterval = 100 // Might want to change this to be based on total size instead of numKeys.
)
func newBuffer(sz int) *bytes.Buffer {
b := new(bytes.Buffer)
b.Grow(sz)
return b
}
type header struct {
plen uint16 // Overlap with base key.
klen uint16 // Length of the diff.
vlen uint16 // Length of value.
prev uint32 // Offset for the previous key-value pair. The offset is relative to block base offset.
}
// Encode encodes the header.
func (h header) Encode(b []byte) {
binary.BigEndian.PutUint16(b[0:2], h.plen)
binary.BigEndian.PutUint16(b[2:4], h.klen)
binary.BigEndian.PutUint16(b[4:6], h.vlen)
binary.BigEndian.PutUint32(b[6:10], h.prev)
}
// Decode decodes the header.
func (h *header) Decode(buf []byte) int {
h.plen = binary.BigEndian.Uint16(buf[0:2])
h.klen = binary.BigEndian.Uint16(buf[2:4])
h.vlen = binary.BigEndian.Uint16(buf[4:6])
h.prev = binary.BigEndian.Uint32(buf[6:10])
return h.Size()
}
// Size returns size of the header. Currently it's just a constant.
func (h header) Size() int { return 10 }
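// Illustrative round trip for the fixed 10-byte layout above (a sketch, not
// part of the original source):
//
//	h := header{plen: 3, klen: 5, vlen: 7, prev: 42}
//	var buf [10]byte
//	h.Encode(buf[:])
//	var got header
//	n := got.Decode(buf[:]) // n == 10, got == h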
// Builder is used in building a table.
type Builder struct {
counter int // Number of keys written for the current block.
// Typically tens or hundreds of meg. This is for one single file.
buf *bytes.Buffer
baseKey []byte // Base key for the current block.
baseOffset uint32 // Offset for the current block.
restarts []uint32 // Base offsets of every block.
// Tracks offset for the previous key-value pair. Offset is relative to block base offset.
prevOffset uint32
keyBuf *bytes.Buffer
keyCount int
}
// NewTableBuilder makes a new TableBuilder.
func NewTableBuilder() *Builder {
return &Builder{
keyBuf: newBuffer(1 << 20),
buf: newBuffer(1 << 20),
prevOffset: math.MaxUint32, // Used for the first element!
}
}
// Close closes the TableBuilder.
func (b *Builder) Close() {}
// Empty returns whether it's empty.
func (b *Builder) Empty() bool { return b.buf.Len() == 0 }
// keyDiff returns a suffix of newKey that is different from b.baseKey.
func (b Builder) keyDiff(newKey []byte) []byte {
var i int
for i = 0; i < len(newKey) && i < len(b.baseKey); i++ {
if newKey[i] != b.baseKey[i] {
break
}
}
return newKey[i:]
}
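// Worked example (illustrative): with baseKey "applesauce" and newKey
// "applepie", the first mismatch is at index 5, so keyDiff returns "pie".
// addHelper then stores plen=5 (shared prefix "apple") and klen=3, and a
// reader reconstructs the full key as baseKey[:plen] + diff.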
func (b *Builder) addHelper(key []byte, v y.ValueStruct) {
	// Stash the key (minus its timestamp) in keyBuf; Finish will add these keys to the bloom filter.
if len(key) > 0 {
var klen [2]byte
keyNoTs := y.ParseKey(key)
binary.BigEndian.PutUint16(klen[:], uint16(len(keyNoTs)))
b.keyBuf.Write(klen[:])
b.keyBuf.Write(keyNoTs)
b.keyCount++
}
// diffKey stores the difference of key with baseKey.
var diffKey []byte
if len(b.baseKey) == 0 {
// Make a copy. Builder should not keep references. Otherwise, caller has to be very careful
// and will have to make copies of keys every time they add to builder, which is even worse.
b.baseKey = append(b.baseKey[:0], key...)
diffKey = key
} else {
diffKey = b.keyDiff(key)
}
h := header{
plen: uint16(len(key) - len(diffKey)),
klen: uint16(len(diffKey)),
vlen: uint16(v.EncodedSize()),
prev: b.prevOffset, // prevOffset is the location of the last key-value added.
}
b.prevOffset = uint32(b.buf.Len()) - b.baseOffset // Remember current offset for the next Add call.
// Layout: header, diffKey, value.
var hbuf [10]byte
h.Encode(hbuf[:])
b.buf.Write(hbuf[:])
b.buf.Write(diffKey) // We only need to store the key difference.
v.EncodeTo(b.buf)
b.counter++ // Increment number of keys added for this current block.
}
func (b *Builder) finishBlock() {
// When we are at the end of the block and Valid=false, and the user wants to do a Prev,
// we need a dummy header to tell us the offset of the previous key-value pair.
b.addHelper([]byte{}, y.ValueStruct{})
}
// Add adds a key-value pair to the block.
// When b.counter reaches restartInterval, the current block is finished and a new block is started.
func (b *Builder) Add(key []byte, value y.ValueStruct) error {
if b.counter >= restartInterval {
b.finishBlock()
// Start a new block. Initialize the block.
b.restarts = append(b.restarts, uint32(b.buf.Len()))
b.counter = 0
b.baseKey = []byte{}
b.baseOffset = uint32(b.buf.Len())
b.prevOffset = math.MaxUint32 // First key-value pair of block has header.prev=MaxInt.
}
b.addHelper(key, value)
return nil // Currently, there is no meaningful error.
}
// ReachedCapacity returns true if the builder's *rough* estimated final size,
// counting the trailing empty header and the restarts index that are not yet
// written, exceeds cap.
// TODO: Look into why there is a discrepancy. I suspect it is because of the
// Write(empty, empty) at the end. The diff can vary.
func (b *Builder) ReachedCapacity(cap int64) bool {
estimateSz := b.buf.Len() + 8 /* empty header */ + 4*len(b.restarts) + 8 // 8 = end of buf offset + len(restarts).
return int64(estimateSz) > cap
}
// blockIndex generates the block index for the table.
// It is mainly a list of all the block base offsets.
func (b *Builder) blockIndex() []byte {
// Store the end offset, so we know the length of the final block.
b.restarts = append(b.restarts, uint32(b.buf.Len()))
// Add 4 because we want to write out number of restarts at the end.
sz := 4*len(b.restarts) + 4
out := make([]byte, sz)
buf := out
for _, r := range b.restarts {
binary.BigEndian.PutUint32(buf[:4], r)
buf = buf[4:]
}
binary.BigEndian.PutUint32(buf[:4], uint32(len(b.restarts)))
return out
}
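// Illustrative layout: with block base offsets [0, 4096, 8192] and a buffer
// of 11000 bytes, blockIndex emits five big-endian uint32s:
//
//	0, 4096, 8192, 11000, 4 // three block bases, the end offset, count=4
//
// readIndex in table.go reads this back from the end of the file.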
// Finish finishes the table by appending the index.
func (b *Builder) Finish() []byte {
bf := bbloom.New(float64(b.keyCount), 0.01)
var klen [2]byte
key := make([]byte, 1024)
for {
if _, err := b.keyBuf.Read(klen[:]); err == io.EOF {
break
} else if err != nil {
y.Check(err)
}
kl := int(binary.BigEndian.Uint16(klen[:]))
if cap(key) < kl {
			key = make([]byte, 2*int(kl)) // Convert to int first; doubling a uint16 could overflow.
}
key = key[:kl]
y.Check2(b.keyBuf.Read(key))
bf.Add(key)
}
b.finishBlock() // This will never start a new block.
index := b.blockIndex()
b.buf.Write(index)
// Write bloom filter.
bdata := bf.JSONMarshal()
n, err := b.buf.Write(bdata)
y.Check(err)
var buf [4]byte
binary.BigEndian.PutUint32(buf[:], uint32(n))
b.buf.Write(buf[:])
return b.buf.Bytes()
}
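// Illustrative usage of the builder (a sketch; assumes keys arrive in sorted
// order, as tables require, and that sortedKVs is a hypothetical slice):
//
//	b := NewTableBuilder()
//	defer b.Close()
//	for _, kv := range sortedKVs {
//		y.Check(b.Add(kv.key, kv.value)) // kv.value is a y.ValueStruct
//	}
//	data := b.Finish() // blocks + block index + bloom filter + 4-byte bloom length
//
// data would then be written out as a .sst file for OpenTable to read back.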

vendor/github.com/dgraph-io/badger/table/iterator.go generated vendored Normal file

@@ -0,0 +1,539 @@
/*
* Copyright 2017 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package table
import (
"bytes"
"io"
"math"
"sort"
"github.com/dgraph-io/badger/y"
"github.com/pkg/errors"
)
type blockIterator struct {
data []byte
pos uint32
err error
baseKey []byte
key []byte
val []byte
init bool
last header // The last header we saw.
}
func (itr *blockIterator) Reset() {
itr.pos = 0
itr.err = nil
itr.baseKey = []byte{}
itr.key = []byte{}
itr.val = []byte{}
itr.init = false
itr.last = header{}
}
func (itr *blockIterator) Init() {
if !itr.init {
itr.Next()
}
}
func (itr *blockIterator) Valid() bool {
return itr != nil && itr.err == nil
}
func (itr *blockIterator) Error() error {
return itr.err
}
func (itr *blockIterator) Close() {}
var (
origin = 0
current = 1
)
// Seek brings us to the first block element that is >= input key.
func (itr *blockIterator) Seek(key []byte, whence int) {
itr.err = nil
switch whence {
case origin:
itr.Reset()
case current:
}
var done bool
for itr.Init(); itr.Valid(); itr.Next() {
k := itr.Key()
if y.CompareKeys(k, key) >= 0 {
// We are done as k is >= key.
done = true
break
}
}
if !done {
itr.err = io.EOF
}
}
func (itr *blockIterator) SeekToFirst() {
itr.err = nil
itr.Init()
}
// SeekToLast brings us to the last element. Valid should return true.
func (itr *blockIterator) SeekToLast() {
itr.err = nil
for itr.Init(); itr.Valid(); itr.Next() {
}
itr.Prev()
}
// parseKV parses the key and value at the current position, reusing itr.key
// and itr.val where their capacity allows, and growing them otherwise.
func (itr *blockIterator) parseKV(h header) {
if cap(itr.key) < int(h.plen+h.klen) {
sz := int(h.plen) + int(h.klen) // Convert to int before adding to avoid uint16 overflow.
itr.key = make([]byte, 2*sz)
}
itr.key = itr.key[:h.plen+h.klen]
copy(itr.key, itr.baseKey[:h.plen])
copy(itr.key[h.plen:], itr.data[itr.pos:itr.pos+uint32(h.klen)])
itr.pos += uint32(h.klen)
if itr.pos+uint32(h.vlen) > uint32(len(itr.data)) {
itr.err = errors.Errorf("Value exceeded size of block: %d %d %d %d %v",
itr.pos, h.klen, h.vlen, len(itr.data), h)
return
}
itr.val = y.SafeCopy(itr.val, itr.data[itr.pos:itr.pos+uint32(h.vlen)])
itr.pos += uint32(h.vlen)
}
func (itr *blockIterator) Next() {
itr.init = true
itr.err = nil
if itr.pos >= uint32(len(itr.data)) {
itr.err = io.EOF
return
}
var h header
itr.pos += uint32(h.Decode(itr.data[itr.pos:]))
itr.last = h // Store the last header.
if h.klen == 0 && h.plen == 0 {
// Last entry in the table.
itr.err = io.EOF
return
}
// Populate baseKey if it isn't set yet. This would only happen for the first Next.
if len(itr.baseKey) == 0 {
// This should be the first Next() for this block. Hence, prefix length should be zero.
y.AssertTrue(h.plen == 0)
itr.baseKey = itr.data[itr.pos : itr.pos+uint32(h.klen)]
}
itr.parseKV(h)
}
func (itr *blockIterator) Prev() {
if !itr.init {
return
}
itr.err = nil
if itr.last.prev == math.MaxUint32 {
// This is the first element of the block!
itr.err = io.EOF
itr.pos = 0
return
}
// Move back using current header's prev.
itr.pos = itr.last.prev
var h header
y.AssertTruef(itr.pos < uint32(len(itr.data)), "%d %d", itr.pos, len(itr.data))
itr.pos += uint32(h.Decode(itr.data[itr.pos:]))
itr.parseKV(h)
itr.last = h
}
func (itr *blockIterator) Key() []byte {
if itr.err != nil {
return nil
}
return itr.key
}
func (itr *blockIterator) Value() []byte {
if itr.err != nil {
return nil
}
return itr.val
}
// Iterator is an iterator for a Table.
type Iterator struct {
t *Table
bpos int
bi *blockIterator
err error
// Internally, Iterator is bidirectional. However, we only expose the
// unidirectional functionality for now.
reversed bool
}
// NewIterator returns a new iterator of the Table
func (t *Table) NewIterator(reversed bool) *Iterator {
t.IncrRef() // Important.
ti := &Iterator{t: t, reversed: reversed}
ti.next()
return ti
}
// Close closes the iterator (and it must be called).
func (itr *Iterator) Close() error {
return itr.t.DecrRef()
}
func (itr *Iterator) reset() {
itr.bpos = 0
itr.err = nil
}
// Valid follows the y.Iterator interface
func (itr *Iterator) Valid() bool {
return itr.err == nil
}
func (itr *Iterator) seekToFirst() {
numBlocks := len(itr.t.blockIndex)
if numBlocks == 0 {
itr.err = io.EOF
return
}
itr.bpos = 0
block, err := itr.t.block(itr.bpos)
if err != nil {
itr.err = err
return
}
itr.bi = block.NewIterator()
itr.bi.SeekToFirst()
itr.err = itr.bi.Error()
}
func (itr *Iterator) seekToLast() {
numBlocks := len(itr.t.blockIndex)
if numBlocks == 0 {
itr.err = io.EOF
return
}
itr.bpos = numBlocks - 1
block, err := itr.t.block(itr.bpos)
if err != nil {
itr.err = err
return
}
itr.bi = block.NewIterator()
itr.bi.SeekToLast()
itr.err = itr.bi.Error()
}
func (itr *Iterator) seekHelper(blockIdx int, key []byte) {
itr.bpos = blockIdx
block, err := itr.t.block(blockIdx)
if err != nil {
itr.err = err
return
}
itr.bi = block.NewIterator()
itr.bi.Seek(key, origin)
itr.err = itr.bi.Error()
}
// seekFrom brings us to a key that is >= input key.
func (itr *Iterator) seekFrom(key []byte, whence int) {
itr.err = nil
switch whence {
case origin:
itr.reset()
case current:
}
idx := sort.Search(len(itr.t.blockIndex), func(idx int) bool {
ko := itr.t.blockIndex[idx]
return y.CompareKeys(ko.key, key) > 0
})
if idx == 0 {
// The smallest key in our table is already strictly > key. We can return that.
// This is like a SeekToFirst.
itr.seekHelper(0, key)
return
}
// block[idx].smallest is > key.
// Since idx>0, we know block[idx-1].smallest is <= key.
// There are two cases.
// 1) Everything in block[idx-1] is strictly < key. In this case, we should go to the first
// element of block[idx].
// 2) Some element in block[idx-1] is >= key. We should go to that element.
itr.seekHelper(idx-1, key)
if itr.err == io.EOF {
// Case 1. Need to visit block[idx].
if idx == len(itr.t.blockIndex) {
// If idx == len(itr.t.blockIndex), then input key is greater than ANY element of table.
// There's nothing we can do. Valid() should return false as we seek to end of table.
return
}
// Since block[idx].smallest is > key, this is essentially a block[idx].SeekToFirst.
itr.seekHelper(idx, key)
}
// Case 2: No need to do anything. We already did the seek in block[idx-1].
}
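// Worked example (illustrative): say the blocks' smallest keys are "b", "f",
// "m" and we seek "g". sort.Search returns idx=2 (the first block whose
// smallest key is > "g"), so we first seek inside block idx-1 = 1. If block 1
// holds an "h", we stop there (case 2). If everything in block 1 is < "g",
// the block iterator reports io.EOF and we fall through to the first key of
// block 2, "m" (case 1).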
// seek will reset iterator and seek to >= key.
func (itr *Iterator) seek(key []byte) {
itr.seekFrom(key, origin)
}
// seekForPrev will reset iterator and seek to <= key.
func (itr *Iterator) seekForPrev(key []byte) {
// TODO: Optimize this. We shouldn't have to take a Prev step.
itr.seekFrom(key, origin)
if !bytes.Equal(itr.Key(), key) {
itr.prev()
}
}
func (itr *Iterator) next() {
itr.err = nil
if itr.bpos >= len(itr.t.blockIndex) {
itr.err = io.EOF
return
}
if itr.bi == nil {
block, err := itr.t.block(itr.bpos)
if err != nil {
itr.err = err
return
}
itr.bi = block.NewIterator()
itr.bi.SeekToFirst()
itr.err = itr.bi.Error()
return
}
itr.bi.Next()
if !itr.bi.Valid() {
itr.bpos++
itr.bi = nil
itr.next()
return
}
}
func (itr *Iterator) prev() {
itr.err = nil
if itr.bpos < 0 {
itr.err = io.EOF
return
}
if itr.bi == nil {
block, err := itr.t.block(itr.bpos)
if err != nil {
itr.err = err
return
}
itr.bi = block.NewIterator()
itr.bi.SeekToLast()
itr.err = itr.bi.Error()
return
}
itr.bi.Prev()
if !itr.bi.Valid() {
itr.bpos--
itr.bi = nil
itr.prev()
return
}
}
// Key follows the y.Iterator interface
func (itr *Iterator) Key() []byte {
return itr.bi.Key()
}
// Value follows the y.Iterator interface
func (itr *Iterator) Value() (ret y.ValueStruct) {
ret.Decode(itr.bi.Value())
return
}
// Next follows the y.Iterator interface
func (itr *Iterator) Next() {
if !itr.reversed {
itr.next()
} else {
itr.prev()
}
}
// Rewind follows the y.Iterator interface
func (itr *Iterator) Rewind() {
if !itr.reversed {
itr.seekToFirst()
} else {
itr.seekToLast()
}
}
// Seek follows the y.Iterator interface
func (itr *Iterator) Seek(key []byte) {
if !itr.reversed {
itr.seek(key)
} else {
itr.seekForPrev(key)
}
}
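// Illustrative usage (a sketch):
//
//	it := tbl.NewIterator(false) // forward iteration; takes a table reference
//	defer it.Close()             // releases that reference
//	for it.Seek(from); it.Valid(); it.Next() {
//		k, v := it.Key(), it.Value()
//		_, _ = k, v
//	}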
// ConcatIterator concatenates the sequences defined by several iterators. (It only works with
// TableIterators, probably just because it's faster to not be so generic.)
type ConcatIterator struct {
idx int // Which iterator is active now.
cur *Iterator
iters []*Iterator // Corresponds to tables.
tables []*Table // Disregarding reversed, this is in ascending order.
reversed bool
}
// NewConcatIterator creates a new concatenated iterator
func NewConcatIterator(tbls []*Table, reversed bool) *ConcatIterator {
iters := make([]*Iterator, len(tbls))
for i := 0; i < len(tbls); i++ {
iters[i] = tbls[i].NewIterator(reversed)
}
return &ConcatIterator{
reversed: reversed,
iters: iters,
tables: tbls,
		idx: -1, // Not really necessary because s.cur is nil, so Valid() returns false; but good to have.
}
}
func (s *ConcatIterator) setIdx(idx int) {
s.idx = idx
if idx < 0 || idx >= len(s.iters) {
s.cur = nil
} else {
s.cur = s.iters[s.idx]
}
}
// Rewind implements y.Interface
func (s *ConcatIterator) Rewind() {
if len(s.iters) == 0 {
return
}
if !s.reversed {
s.setIdx(0)
} else {
s.setIdx(len(s.iters) - 1)
}
s.cur.Rewind()
}
// Valid implements y.Interface
func (s *ConcatIterator) Valid() bool {
return s.cur != nil && s.cur.Valid()
}
// Key implements y.Interface
func (s *ConcatIterator) Key() []byte {
return s.cur.Key()
}
// Value implements y.Interface
func (s *ConcatIterator) Value() y.ValueStruct {
return s.cur.Value()
}
// Seek brings us to element >= key if reversed is false. Otherwise, <= key.
func (s *ConcatIterator) Seek(key []byte) {
var idx int
if !s.reversed {
idx = sort.Search(len(s.tables), func(i int) bool {
return y.CompareKeys(s.tables[i].Biggest(), key) >= 0
})
} else {
n := len(s.tables)
idx = n - 1 - sort.Search(n, func(i int) bool {
return y.CompareKeys(s.tables[n-1-i].Smallest(), key) <= 0
})
}
if idx >= len(s.tables) || idx < 0 {
s.setIdx(-1)
return
}
	// For reversed=false, we know s.tables[idx-1].Biggest() < key. Thus, the
	// previous table cannot possibly contain key.
s.setIdx(idx)
s.cur.Seek(key)
}
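// Worked example (illustrative): three tables spanning [a..c], [d..f],
// [g..k], with reversed=false, and Seek("e"): the binary search scans the
// Biggest() values ("c", "f", "k") for the first one >= "e", picks table 1,
// and the inner Seek lands on "e" or on the next key after it.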
// Next advances our concat iterator.
func (s *ConcatIterator) Next() {
s.cur.Next()
if s.cur.Valid() {
// Nothing to do. Just stay with the current table.
return
}
for { // In case there are empty tables.
if !s.reversed {
s.setIdx(s.idx + 1)
} else {
s.setIdx(s.idx - 1)
}
if s.cur == nil {
// End of list. Valid will become false.
return
}
s.cur.Rewind()
if s.cur.Valid() {
break
}
}
}
// Close implements y.Interface.
func (s *ConcatIterator) Close() error {
for _, it := range s.iters {
if err := it.Close(); err != nil {
return errors.Wrap(err, "ConcatIterator")
}
}
return nil
}

vendor/github.com/dgraph-io/badger/table/table.go generated vendored Normal file

@@ -0,0 +1,359 @@
/*
* Copyright 2017 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package table
import (
"encoding/binary"
"fmt"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"github.com/AndreasBriese/bbloom"
"github.com/dgraph-io/badger/options"
"github.com/dgraph-io/badger/y"
"github.com/pkg/errors"
)
const fileSuffix = ".sst"
type keyOffset struct {
key []byte
offset int
len int
}
// Table represents a loaded table file with the info we have about it
type Table struct {
sync.Mutex
fd *os.File // Own fd.
tableSize int // Initialized in OpenTable, using fd.Stat().
blockIndex []keyOffset
ref int32 // For file garbage collection. Atomic.
loadingMode options.FileLoadingMode
mmap []byte // Memory mapped.
// The following are initialized once and const.
smallest, biggest []byte // Smallest and largest keys.
id uint64 // file id, part of filename
bf bbloom.Bloom
}
// IncrRef increments the refcount (having to do with whether the file should be deleted)
func (t *Table) IncrRef() {
atomic.AddInt32(&t.ref, 1)
}
// DecrRef decrements the refcount and possibly deletes the table
func (t *Table) DecrRef() error {
newRef := atomic.AddInt32(&t.ref, -1)
if newRef == 0 {
// We can safely delete this file, because for all the current files, we always have
// at least one reference pointing to them.
		// Unmap before deleting; on Windows a memory-mapped file cannot be removed.
if t.loadingMode == options.MemoryMap {
y.Munmap(t.mmap)
}
if err := t.fd.Truncate(0); err != nil {
// This is very important to let the FS know that the file is deleted.
return err
}
filename := t.fd.Name()
if err := t.fd.Close(); err != nil {
return err
}
if err := os.Remove(filename); err != nil {
return err
}
}
return nil
}
type block struct {
offset int
data []byte
}
func (b block) NewIterator() *blockIterator {
return &blockIterator{data: b.data}
}
// OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function
// entry. Returns a table with one reference count on it (decrementing which may delete the file!
// -- consider t.Close() instead). The fd has to be writeable because we call Truncate on it before
// deleting.
func OpenTable(fd *os.File, loadingMode options.FileLoadingMode) (*Table, error) {
fileInfo, err := fd.Stat()
if err != nil {
// It's OK to ignore fd.Close() errs in this function because we have only read
// from the file.
_ = fd.Close()
return nil, y.Wrap(err)
}
filename := fileInfo.Name()
id, ok := ParseFileID(filename)
if !ok {
_ = fd.Close()
return nil, errors.Errorf("Invalid filename: %s", filename)
}
t := &Table{
fd: fd,
ref: 1, // Caller is given one reference.
id: id,
loadingMode: loadingMode,
}
t.tableSize = int(fileInfo.Size())
if loadingMode == options.MemoryMap {
t.mmap, err = y.Mmap(fd, false, fileInfo.Size())
if err != nil {
_ = fd.Close()
return nil, y.Wrapf(err, "Unable to map file")
}
} else if loadingMode == options.LoadToRAM {
err = t.loadToRAM()
if err != nil {
_ = fd.Close()
return nil, y.Wrap(err)
}
}
if err := t.readIndex(); err != nil {
return nil, y.Wrap(err)
}
it := t.NewIterator(false)
defer it.Close()
it.Rewind()
if it.Valid() {
t.smallest = it.Key()
}
it2 := t.NewIterator(true)
defer it2.Close()
it2.Rewind()
if it2.Valid() {
t.biggest = it2.Key()
}
return t, nil
}
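// Illustrative usage (a sketch; error handling elided). The fd must be opened
// read-write, because dropping the last reference truncates and deletes:
//
//	fd, err := os.OpenFile(NewFilename(id, dir), os.O_RDWR, 0666)
//	y.Check(err) // assumes the file was previously written via Builder.Finish
//	t, err := OpenTable(fd, options.MemoryMap)
//	y.Check(err)
//	defer t.DecrRef() // the last DecrRef deletes the file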
// Close closes the open table. (Releases resources back to the OS.)
func (t *Table) Close() error {
if t.loadingMode == options.MemoryMap {
y.Munmap(t.mmap)
}
return t.fd.Close()
}
func (t *Table) read(off int, sz int) ([]byte, error) {
if len(t.mmap) > 0 {
if len(t.mmap[off:]) < sz {
return nil, y.ErrEOF
}
return t.mmap[off : off+sz], nil
}
res := make([]byte, sz)
nbr, err := t.fd.ReadAt(res, int64(off))
y.NumReads.Add(1)
y.NumBytesRead.Add(int64(nbr))
return res, err
}
func (t *Table) readNoFail(off int, sz int) []byte {
res, err := t.read(off, sz)
y.Check(err)
return res
}
func (t *Table) readIndex() error {
readPos := t.tableSize
// Read bloom filter.
readPos -= 4
buf := t.readNoFail(readPos, 4)
bloomLen := int(binary.BigEndian.Uint32(buf))
readPos -= bloomLen
data := t.readNoFail(readPos, bloomLen)
t.bf = bbloom.JSONUnmarshal(data)
readPos -= 4
buf = t.readNoFail(readPos, 4)
restartsLen := int(binary.BigEndian.Uint32(buf))
readPos -= 4 * restartsLen
buf = t.readNoFail(readPos, 4*restartsLen)
offsets := make([]int, restartsLen)
for i := 0; i < restartsLen; i++ {
offsets[i] = int(binary.BigEndian.Uint32(buf[:4]))
buf = buf[4:]
}
// The last offset stores the end of the last block.
for i := 0; i < len(offsets); i++ {
var o int
if i == 0 {
o = 0
} else {
o = offsets[i-1]
}
ko := keyOffset{
offset: o,
len: offsets[i] - o,
}
t.blockIndex = append(t.blockIndex, ko)
}
che := make(chan error, len(t.blockIndex))
blocks := make(chan int, len(t.blockIndex))
for i := 0; i < len(t.blockIndex); i++ {
blocks <- i
}
for i := 0; i < 64; i++ { // Run 64 goroutines.
go func() {
var h header
for index := range blocks {
ko := &t.blockIndex[index]
offset := ko.offset
buf, err := t.read(offset, h.Size())
if err != nil {
che <- errors.Wrap(err, "While reading first header in block")
continue
}
h.Decode(buf)
y.AssertTruef(h.plen == 0, "Key offset: %+v, h.plen = %d", *ko, h.plen)
offset += h.Size()
buf = make([]byte, h.klen)
var out []byte
if out, err = t.read(offset, int(h.klen)); err != nil {
che <- errors.Wrap(err, "While reading first key in block")
continue
}
y.AssertTrue(len(buf) == copy(buf, out))
ko.key = buf
che <- nil
}
}()
}
close(blocks) // to stop reading goroutines
var readError error
for i := 0; i < len(t.blockIndex); i++ {
if err := <-che; err != nil && readError == nil {
readError = err
}
}
if readError != nil {
return readError
}
return nil
}
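// The layout readIndex walks, from the end of the file backwards (this
// mirrors what Builder.Finish appends in builder.go):
//
//	| blocks ... | restart offsets (4B each) | restart count (4B) | bloom filter (JSON) | bloom len (4B) |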
func (t *Table) block(idx int) (block, error) {
y.AssertTruef(idx >= 0, "idx=%d", idx)
if idx >= len(t.blockIndex) {
return block{}, errors.New("block out of index")
}
ko := t.blockIndex[idx]
blk := block{
offset: ko.offset,
}
var err error
blk.data, err = t.read(blk.offset, ko.len)
return blk, err
}
// Size is its file size in bytes
func (t *Table) Size() int64 { return int64(t.tableSize) }
// Smallest is its smallest key, or nil if there are none
func (t *Table) Smallest() []byte { return t.smallest }
// Biggest is its biggest key, or nil if there are none
func (t *Table) Biggest() []byte { return t.biggest }
// Filename is NOT the file name. Just kidding, it is.
func (t *Table) Filename() string { return t.fd.Name() }
// ID is the table's ID number (used to make the file name).
func (t *Table) ID() uint64 { return t.id }
// DoesNotHave returns true if (but not "only if") the table does not have the key. It does a
// bloom filter lookup.
func (t *Table) DoesNotHave(key []byte) bool { return !t.bf.Has(key) }
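// Illustrative use (a sketch): a lookup can skip whole tables cheaply. Note
// the filter was built on keys stripped of their timestamps (see builder.go),
// so pass the key without its timestamp:
//
//	if tbl.DoesNotHave(y.ParseKey(key)) {
//		// Key is definitely absent from this table; try the next one.
//	}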
// ParseFileID reads the file id out of a filename.
func ParseFileID(name string) (uint64, bool) {
name = path.Base(name)
if !strings.HasSuffix(name, fileSuffix) {
return 0, false
}
name = strings.TrimSuffix(name, fileSuffix)
id, err := strconv.Atoi(name)
if err != nil {
return 0, false
}
y.AssertTrue(id >= 0)
return uint64(id), true
}
// IDToFilename does the inverse of ParseFileID
func IDToFilename(id uint64) string {
return fmt.Sprintf("%06d", id) + fileSuffix
}
// NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table
// filepath.
func NewFilename(id uint64, dir string) string {
return filepath.Join(dir, IDToFilename(id))
}
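// Examples, derived from the three functions above:
//
//	ParseFileID("000007.sst")     // -> (7, true)
//	ParseFileID("dir/000007.sst") // -> (7, true); path.Base strips the dir
//	ParseFileID("000007.txt")     // -> (0, false); wrong suffix
//	IDToFilename(7)               // -> "000007.sst"
//	NewFilename(7, "/tmp/badger") // -> "/tmp/badger/000007.sst"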
func (t *Table) loadToRAM() error {
t.mmap = make([]byte, t.tableSize)
read, err := t.fd.ReadAt(t.mmap, 0)
if err != nil || read != t.tableSize {
return y.Wrapf(err, "Unable to load file in memory. Table file: %s", t.Filename())
}
y.NumReads.Add(1)
y.NumBytesRead.Add(int64(read))
return nil
}