feat: Add BadgerDB support. (#353)

See https://github.com/dgraph-io/badger
Slides: https://github.com/gopherchina/conference/blob/master/2018/1.5%20Badger_%20Fast%20Key-Value%20DB%20in%20Go.pdf

This commit is contained in:
parent 069efa9e5e
commit ffa8eb12b3

@ -60,7 +60,7 @@ A push notification micro server using [Gin](https://github.com/gin-gonic/gin) f
 * Support notification queue and multiple workers.
 * Support `/api/stat/app` show notification success and failure counts.
 * Support `/api/config` show your [YAML](https://en.wikipedia.org/wiki/YAML) config.
-* Support store app stat to memory, [Redis](http://redis.io/), [BoltDB](https://github.com/boltdb/bolt), [BuntDB](https://github.com/tidwall/buntdb) or [LevelDB](https://github.com/syndtr/goleveldb).
+* Support store app stat to memory, [Redis](http://redis.io/), [BoltDB](https://github.com/boltdb/bolt), [BuntDB](https://github.com/tidwall/buntdb), [LevelDB](https://github.com/syndtr/goleveldb) or [BadgerDB](https://github.com/dgraph-io/badger).
 * Support `p8`, `p12` or `pem` format of iOS certificate file.
 * Support `/sys/stats` show response time, status code count, etc.
 * Support for HTTP proxy to Google server (FCM).

@ -4,6 +4,7 @@ import (
 	"errors"
 	"net/http"

+	"github.com/appleboy/gorush/storage/badger"
 	"github.com/appleboy/gorush/storage/boltdb"
 	"github.com/appleboy/gorush/storage/buntdb"
 	"github.com/appleboy/gorush/storage/leveldb"

@ -52,6 +53,8 @@ func InitAppStatus() error {
 		StatStorage = buntdb.New(PushConf)
 	case "leveldb":
 		StatStorage = leveldb.New(PushConf)
+	case "badger":
+		StatStorage = badger.New(PushConf)
 	default:
 		LogError.Error("storage error: can't find storage driver")
 		return errors.New("can't find storage driver")

@ -187,3 +187,29 @@ func TestStatForBuntDBEngine(t *testing.T) {
 // 	val = StatStorage.GetAndroidError()
 // 	assert.Equal(t, int64(500), val)
 // }
+
+func TestStatForBadgerEngine(t *testing.T) {
+	var val int64
+	PushConf.Stat.Engine = "badger"
+	err := InitAppStatus()
+	assert.Nil(t, err)
+
+	StatStorage.Reset()
+
+	StatStorage.AddTotalCount(100)
+	StatStorage.AddIosSuccess(200)
+	StatStorage.AddIosError(300)
+	StatStorage.AddAndroidSuccess(400)
+	StatStorage.AddAndroidError(500)
+
+	val = StatStorage.GetTotalCount()
+	assert.Equal(t, int64(100), val)
+	val = StatStorage.GetIosSuccess()
+	assert.Equal(t, int64(200), val)
+	val = StatStorage.GetIosError()
+	assert.Equal(t, int64(300), val)
+	val = StatStorage.GetAndroidSuccess()
+	assert.Equal(t, int64(400), val)
+	val = StatStorage.GetAndroidError()
+	assert.Equal(t, int64(500), val)
+}

@ -0,0 +1,183 @@
package badger

import (
	"fmt"
	"log"
	"os"
	"path/filepath"
	"strconv"

	"github.com/appleboy/gorush/config"
	"github.com/appleboy/gorush/storage"

	"github.com/appleboy/com/convert"
	"github.com/dgraph-io/badger"
)

// New func implements the storage interface for gorush (https://github.com/appleboy/gorush)
func New(config config.ConfYaml) *Storage {
	return &Storage{
		config: config,
	}
}

// Storage is the storage interface structure backed by BadgerDB.
type Storage struct {
	config config.ConfYaml
	opts   badger.Options
	name   string
}

// Init client storage.
func (s *Storage) Init() error {
	s.name = "badger"
	s.opts = badger.DefaultOptions
	// Join the path components so the data directory always lands inside the
	// temp directory, whether or not os.TempDir() has a trailing separator.
	dir := filepath.Join(os.TempDir(), "badger")
	s.opts.Dir = dir
	s.opts.ValueDir = dir
	fmt.Println(s.opts.Dir)
	fmt.Println(s.opts.ValueDir)
	return nil
}

// Reset client storage.
func (s *Storage) Reset() {
	s.setBadger(storage.TotalCountKey, 0)
	s.setBadger(storage.IosSuccessKey, 0)
	s.setBadger(storage.IosErrorKey, 0)
	s.setBadger(storage.AndroidSuccessKey, 0)
	s.setBadger(storage.AndroidErrorKey, 0)
}

// setBadger opens the database, stores count under key and closes the
// database again.
func (s *Storage) setBadger(key string, count int64) {
	db, err := badger.Open(s.opts)

	if err != nil {
		log.Println(s.name, "open error:", err.Error())
		return
	}

	defer func() {
		err := db.Close()
		if err != nil {
			log.Println(s.name, "close error:", err.Error())
		}
	}()

	err = db.Update(func(txn *badger.Txn) error {
		value := convert.ToString(count).(string)
		return txn.Set([]byte(key), []byte(value))
	})

	if err != nil {
		log.Println(s.name, "update error:", err.Error())
	}
}

// getBadger opens the database, reads the value stored under key and parses
// it into *count.
func (s *Storage) getBadger(key string, count *int64) {
	db, err := badger.Open(s.opts)

	if err != nil {
		log.Println(s.name, "open error:", err.Error())
		return
	}

	defer func() {
		err := db.Close()
		if err != nil {
			log.Println(s.name, "close error:", err.Error())
		}
	}()

	err = db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte(key))
		if err != nil {
			return err
		}
		val, err := item.Value()
		if err != nil {
			return err
		}

		i, err := strconv.ParseInt(string(val), 10, 64)
		if err != nil {
			return err
		}

		*count = i

		return nil
	})

	if err != nil {
		log.Println(s.name, "get error:", err.Error())
	}
}

// AddTotalCount records the total push notification count.
func (s *Storage) AddTotalCount(count int64) {
	total := s.GetTotalCount() + count
	s.setBadger(storage.TotalCountKey, total)
}

// AddIosSuccess records the count of successful iOS push notifications.
func (s *Storage) AddIosSuccess(count int64) {
	total := s.GetIosSuccess() + count
	s.setBadger(storage.IosSuccessKey, total)
}

// AddIosError records the count of failed iOS push notifications.
func (s *Storage) AddIosError(count int64) {
	total := s.GetIosError() + count
	s.setBadger(storage.IosErrorKey, total)
}

// AddAndroidSuccess records the count of successful Android push notifications.
func (s *Storage) AddAndroidSuccess(count int64) {
	total := s.GetAndroidSuccess() + count
	s.setBadger(storage.AndroidSuccessKey, total)
}

// AddAndroidError records the count of failed Android push notifications.
func (s *Storage) AddAndroidError(count int64) {
	total := s.GetAndroidError() + count
	s.setBadger(storage.AndroidErrorKey, total)
}

// GetTotalCount shows the count of all notifications.
func (s *Storage) GetTotalCount() int64 {
	var count int64
	s.getBadger(storage.TotalCountKey, &count)

	return count
}

// GetIosSuccess shows the success count of iOS notifications.
func (s *Storage) GetIosSuccess() int64 {
	var count int64
	s.getBadger(storage.IosSuccessKey, &count)

	return count
}

// GetIosError shows the error count of iOS notifications.
func (s *Storage) GetIosError() int64 {
	var count int64
	s.getBadger(storage.IosErrorKey, &count)

	return count
}

// GetAndroidSuccess shows the success count of Android notifications.
func (s *Storage) GetAndroidSuccess() int64 {
	var count int64
	s.getBadger(storage.AndroidSuccessKey, &count)

	return count
}

// GetAndroidError shows the error count of Android notifications.
func (s *Storage) GetAndroidError() int64 {
	var count int64
	s.getBadger(storage.AndroidErrorKey, &count)

	return count
}

@ -0,0 +1,47 @@
package badger

import (
	"testing"

	c "github.com/appleboy/gorush/config"
	"github.com/stretchr/testify/assert"
)

func TestBadgerEngine(t *testing.T) {
	var val int64

	config, _ := c.LoadConf("")

	badger := New(config)
	err := badger.Init()
	assert.Nil(t, err)
	badger.Reset()

	badger.AddTotalCount(10)
	val = badger.GetTotalCount()
	assert.Equal(t, int64(10), val)
	badger.AddTotalCount(10)
	val = badger.GetTotalCount()
	assert.Equal(t, int64(20), val)

	badger.AddIosSuccess(20)
	val = badger.GetIosSuccess()
	assert.Equal(t, int64(20), val)

	badger.AddIosError(30)
	val = badger.GetIosError()
	assert.Equal(t, int64(30), val)

	badger.AddAndroidSuccess(40)
	val = badger.GetAndroidSuccess()
	assert.Equal(t, int64(40), val)

	badger.AddAndroidError(50)
	val = badger.GetAndroidError()
	assert.Equal(t, int64(50), val)

	// test reset db
	badger.Reset()
	val = badger.GetAndroidError()
	assert.Equal(t, int64(0), val)
}

@ -0,0 +1,35 @@
bbloom.go

// The MIT License (MIT)
// Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt

// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

siphash.go

// https://github.com/dchest/siphash
//
// Written in 2012 by Dmitry Chestnykh.
//
// To the extent possible under law, the author have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
// http://creativecommons.org/publicdomain/zero/1.0/
//
// Package siphash implements SipHash-2-4, a fast short-input PRF
// created by Jean-Philippe Aumasson and Daniel J. Bernstein.

@ -0,0 +1,129 @@
## bbloom: a bitset Bloom filter for go/golang
===

package implements a fast bloom filter with real 'bitset' and JSONMarshal/JSONUnmarshal to store/reload the Bloom filter.

NOTE: the package uses unsafe.Pointer to set and read the bits from the bitset. If you're uncomfortable with using the unsafe package, please consider using my bloom filter package at github.com/AndreasBriese/bloom

===

changelog 11/2015: new thread safe methods AddTS(), HasTS(), AddIfNotHasTS() following a suggestion from Srdjan Marinovic (github @a-little-srdjan), who used this to code a bloomfilter cache.

This bloom filter was developed to strengthen a website-log database and was tested and optimized for this log-entry mask: "2014/%02i/%02i %02i:%02i:%02i /info.html".
Nonetheless bbloom should work with any other form of entries.

~~Hash function is a modified Berkeley DB sdbm hash (to optimize for smaller strings). sdbm http://www.cse.yorku.ca/~oz/hash.html~~

Found sipHash (SipHash-2-4, a fast short-input PRF created by Jean-Philippe Aumasson and Daniel J. Bernstein) to be about as fast. sipHash had been ported to Go by Dmitry Chestnykh (github.com/dchest/siphash).

Minimum hashset size is: 512 ([4]uint64; will be set automatically).

### install

```sh
go get github.com/AndreasBriese/bbloom
```

### test
+ change to folder ../bbloom
+ create wordlist in file "words.txt" (you might use `python permut.py`)
+ run 'go test -bench=.' within the folder

```sh
go test -bench=.
```

~~If you've installed the GOCONVEY TDD-framework http://goconvey.co/ you can run the tests automatically.~~

using go's testing framework now (have in mind that the op timing is related to 65536 operations of Add, Has, AddIfNotHas respectively)

### usage

after installation add

```go
import (
	...
	"github.com/AndreasBriese/bbloom"
	...
)
```

at your header. In the program use

```go
// create a bloom filter for 65536 items and 1 % wrong-positive ratio
bf := bbloom.New(float64(1<<16), float64(0.01))

// or
// create a bloom filter with 650000 for 65536 items and 7 locs per hash explicitly
// bf = bbloom.New(float64(650000), float64(7))
// or
bf = bbloom.New(650000.0, 7.0)

// add one item
bf.Add([]byte("butter"))

// Number of elements added is exposed now
// Note: ElemNum will not be included in JSON export (for compatibility to older version)
nOfElementsInFilter := bf.ElemNum

// check if item is in the filter
isIn := bf.Has([]byte("butter"))    // should be true
isNotIn := bf.Has([]byte("Butter")) // should be false

// 'add only if item is new' to the bloomfilter
added := bf.AddIfNotHas([]byte("butter")) // should be false because 'butter' is already in the set
added = bf.AddIfNotHas([]byte("buTTer"))  // should be true because 'buTTer' is new

// thread safe versions for concurrent use: AddTS, HasTS, AddIfNotHasTS
// add one item
bf.AddTS([]byte("peanutbutter"))
// check if item is in the filter
isIn = bf.HasTS([]byte("peanutbutter"))    // should be true
isNotIn = bf.HasTS([]byte("peanutButter")) // should be false
// 'add only if item is new' to the bloomfilter
added = bf.AddIfNotHasTS([]byte("butter"))       // should be false because 'butter' is already in the set
added = bf.AddIfNotHasTS([]byte("peanutbuTTer")) // should be true because 'peanutbuTTer' is new

// convert to JSON ([]byte)
Json := bf.JSONMarshal()

// bloomfilters Mutex is exposed for external un-/locking
// i.e. mutex lock while doing JSON conversion
bf.Mtx.Lock()
Json = bf.JSONMarshal()
bf.Mtx.Unlock()

// restore a bloom filter from storage
bfNew := bbloom.JSONUnmarshal(Json)

isInNew := bfNew.Has([]byte("butter"))    // should be true
isNotInNew := bfNew.Has([]byte("Butter")) // should be false

```

to work with the bloom filter.

### why 'fast'?

It's about 3 times faster than William Fitzgerald's bitset bloom filter https://github.com/willf/bloom . And it is about as fast as my []bool set variant for Bloom filters (see https://github.com/AndreasBriese/bloom ) while having an 8 times smaller memory footprint:

    Bloom filter (filter size 524288, 7 hashlocs)
    github.com/AndreasBriese/bbloom 'Add' 65536 items (10 repetitions): 6595800 ns (100 ns/op)
    github.com/AndreasBriese/bbloom 'Has' 65536 items (10 repetitions): 5986600 ns (91 ns/op)
    github.com/AndreasBriese/bloom 'Add' 65536 items (10 repetitions): 6304684 ns (96 ns/op)
    github.com/AndreasBriese/bloom 'Has' 65536 items (10 repetitions): 6568663 ns (100 ns/op)

    github.com/willf/bloom 'Add' 65536 items (10 repetitions): 24367224 ns (371 ns/op)
    github.com/willf/bloom 'Test' 65536 items (10 repetitions): 21881142 ns (333 ns/op)
    github.com/dataence/bloom/standard 'Add' 65536 items (10 repetitions): 23041644 ns (351 ns/op)
    github.com/dataence/bloom/standard 'Check' 65536 items (10 repetitions): 19153133 ns (292 ns/op)
    github.com/cabello/bloom 'Add' 65536 items (10 repetitions): 131921507 ns (2012 ns/op)
    github.com/cabello/bloom 'Contains' 65536 items (10 repetitions): 131108962 ns (2000 ns/op)

(on MBPro15 OSX10.8.5 i7 4Core 2.4Ghz)

With 32bit bloom filters (bloom32) using modified sdbm, bloom32 does hashing with only 2 bit shifts, one xor and one subtraction per byte. sdbm is about as fast as fnv64a but gives fewer collisions with the dataset (see mask above). bloom.New(float64(10 * 1<<16),float64(7)) populated with 1<<16 random items from the dataset (see above) and tested against the rest results in less than 0.05% collisions.

@ -0,0 +1,270 @@
// The MIT License (MIT)
// Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt

// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

package bbloom

import (
	"bytes"
	"encoding/json"
	"log"
	"math"
	"sync"
	"unsafe"
)

// helper
var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128}

func getSize(ui64 uint64) (size uint64, exponent uint64) {
	if ui64 < uint64(512) {
		ui64 = uint64(512)
	}
	size = uint64(1)
	for size < ui64 {
		size <<= 1
		exponent++
	}
	return size, exponent
}

func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) {
	size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2)
	locs := math.Ceil(float64(0.69314718056) * size / numEntries)
	return uint64(size), uint64(locs)
}

// New
// returns a new bloomfilter
func New(params ...float64) (bloomfilter Bloom) {
	var entries, locs uint64
	if len(params) == 2 {
		if params[1] < 1 {
			entries, locs = calcSizeByWrongPositives(params[0], params[1])
		} else {
			entries, locs = uint64(params[0]), uint64(params[1])
		}
	} else {
		log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))")
	}
	size, exponent := getSize(uint64(entries))
	bloomfilter = Bloom{
		sizeExp: exponent,
		size:    size - 1,
		setLocs: locs,
		shift:   64 - exponent,
	}
	bloomfilter.Size(size)
	return bloomfilter
}

// NewWithBoolset
// takes a []byte slice and number of locs per entry
// returns the bloomfilter with a bitset populated according to the input []byte
func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter Bloom) {
	bloomfilter = New(float64(len(*bs)<<3), float64(locs))
	ptr := uintptr(unsafe.Pointer(&bloomfilter.bitset[0]))
	for _, b := range *bs {
		*(*uint8)(unsafe.Pointer(ptr)) = b
		ptr++
	}
	return bloomfilter
}

// bloomJSONImExport
// Im/Export structure used by JSONMarshal / JSONUnmarshal
type bloomJSONImExport struct {
	FilterSet []byte
	SetLocs   uint64
}

// JSONUnmarshal
// takes JSON-Object (type bloomJSONImExport) as []bytes
// returns bloom32 / bloom64 object
func JSONUnmarshal(dbData []byte) Bloom {
	bloomImEx := bloomJSONImExport{}
	json.Unmarshal(dbData, &bloomImEx)
	buf := bytes.NewBuffer(bloomImEx.FilterSet)
	bs := buf.Bytes()
	bf := NewWithBoolset(&bs, bloomImEx.SetLocs)
	return bf
}

//
// Bloom filter
type Bloom struct {
	Mtx     sync.Mutex
	ElemNum uint64
	bitset  []uint64
	sizeExp uint64
	size    uint64
	setLocs uint64
	shift   uint64
}

// <--- http://www.cse.yorku.ca/~oz/hash.html
// modified Berkeley DB Hash (32bit)
// hash is cast to l, h = 16bit fragments
// func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
// 	hash := uint64(len(*b))
// 	for _, c := range *b {
// 		hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
// 	}
// 	h = hash >> bl.shift
// 	l = hash << bl.shift >> bl.shift
// 	return l, h
// }

// Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm()
// https://131002.net/siphash/
// siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash

// Add
// set the bit(s) for entry; Adds an entry to the Bloom filter
func (bl *Bloom) Add(entry []byte) {
	l, h := bl.sipHash(entry)
	for i := uint64(0); i < (*bl).setLocs; i++ {
		(*bl).Set((h + i*l) & (*bl).size)
		(*bl).ElemNum++
	}
}

// AddTS
// Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
func (bl *Bloom) AddTS(entry []byte) {
	bl.Mtx.Lock()
	defer bl.Mtx.Unlock()
	bl.Add(entry[:])
}

// Has
// check if bit(s) for entry is/are set
// returns true if the entry was added to the Bloom Filter
func (bl Bloom) Has(entry []byte) bool {
	l, h := bl.sipHash(entry)
	for i := uint64(0); i < bl.setLocs; i++ {
		switch bl.IsSet((h + i*l) & bl.size) {
		case false:
			return false
		}
	}
	return true
}

// HasTS
// Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
func (bl *Bloom) HasTS(entry []byte) bool {
	bl.Mtx.Lock()
	defer bl.Mtx.Unlock()
	return bl.Has(entry[:])
}

// AddIfNotHas
// Only Add entry if it's not present in the bloomfilter
// returns true if entry was added
// returns false if entry was already registered in the bloomfilter
func (bl Bloom) AddIfNotHas(entry []byte) (added bool) {
	if bl.Has(entry[:]) {
		return added
	}
	bl.Add(entry[:])
	return true
}

// AddIfNotHasTS
// Thread safe: Only Add entry if it's not present in the bloomfilter
// returns true if entry was added
// returns false if entry was already registered in the bloomfilter
func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) {
	bl.Mtx.Lock()
	defer bl.Mtx.Unlock()
	return bl.AddIfNotHas(entry[:])
}

// Size
// make Bloom filter with as bitset of size sz
func (bl *Bloom) Size(sz uint64) {
	(*bl).bitset = make([]uint64, sz>>6)
}

// Clear
// resets the Bloom filter
func (bl *Bloom) Clear() {
	for i := range (*bl).bitset {
		(*bl).bitset[i] = 0
	}
}

// Set
// set the bit[idx] of bitset
func (bl *Bloom) Set(idx uint64) {
	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
	*(*uint8)(ptr) |= mask[idx%8]
}

// IsSet
// check if bit[idx] of bitset is set
// returns true/false
func (bl *Bloom) IsSet(idx uint64) bool {
	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
	r := ((*(*uint8)(ptr)) >> (idx % 8)) & 1
	return r == 1
}

// JSONMarshal
// returns JSON-object (type bloomJSONImExport) as []byte
func (bl Bloom) JSONMarshal() []byte {
	bloomImEx := bloomJSONImExport{}
	bloomImEx.SetLocs = uint64(bl.setLocs)
	bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
	ptr := uintptr(unsafe.Pointer(&bl.bitset[0]))
	for i := range bloomImEx.FilterSet {
		bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(ptr))
		ptr++
	}
	data, err := json.Marshal(bloomImEx)
	if err != nil {
		log.Fatal("json.Marshal failed: ", err)
	}
	return data
}

// // alternative hashFn
// func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) {
// 	h64 := fnv.New64a()
// 	h64.Write(*b)
// 	hash := h64.Sum64()
// 	h = hash >> 32
// 	l = hash << 32 >> 32
// 	return l, h
// }
//
// // <-- http://partow.net/programming/hashfunctions/index.html
// // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3,
// // under the topic of sorting and search chapter 6.4.
// // modified to fit with boolset-length
// func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) {
// 	hash := uint64(len(*b))
// 	for _, c := range *b {
// 		hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c)
// 	}
// 	h = hash >> bl.shift
// 	l = hash << bl.sizeExp >> bl.sizeExp
// 	return l, h
// }

@ -0,0 +1,225 @@
// Written in 2012 by Dmitry Chestnykh.
//
// To the extent possible under law, the author have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
// http://creativecommons.org/publicdomain/zero/1.0/
//
// Package siphash implements SipHash-2-4, a fast short-input PRF
// created by Jean-Philippe Aumasson and Daniel J. Bernstein.

package bbloom

// Hash returns the 64-bit SipHash-2-4 of the given byte slice with two 64-bit
// parts of 128-bit key: k0 and k1.
func (bl Bloom) sipHash(p []byte) (l, h uint64) {
	// Initialization.
	v0 := uint64(8317987320269560794) // k0 ^ 0x736f6d6570736575
	v1 := uint64(7237128889637516672) // k1 ^ 0x646f72616e646f6d
	v2 := uint64(7816392314733513934) // k0 ^ 0x6c7967656e657261
	v3 := uint64(8387220255325274014) // k1 ^ 0x7465646279746573
	t := uint64(len(p)) << 56

	// Compression.
	for len(p) >= 8 {

		m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 |
			uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56

		v3 ^= m

		// Round 1.
		v0 += v1
		v1 = v1<<13 | v1>>51
		v1 ^= v0
		v0 = v0<<32 | v0>>32

		v2 += v3
		v3 = v3<<16 | v3>>48
		v3 ^= v2

		v0 += v3
		v3 = v3<<21 | v3>>43
		v3 ^= v0

		v2 += v1
		v1 = v1<<17 | v1>>47
		v1 ^= v2
		v2 = v2<<32 | v2>>32

		// Round 2.
		v0 += v1
		v1 = v1<<13 | v1>>51
		v1 ^= v0
		v0 = v0<<32 | v0>>32

		v2 += v3
		v3 = v3<<16 | v3>>48
		v3 ^= v2

		v0 += v3
		v3 = v3<<21 | v3>>43
		v3 ^= v0

		v2 += v1
		v1 = v1<<17 | v1>>47
		v1 ^= v2
		v2 = v2<<32 | v2>>32

		v0 ^= m
		p = p[8:]
	}

	// Compress last block.
	switch len(p) {
	case 7:
		t |= uint64(p[6]) << 48
		fallthrough
	case 6:
		t |= uint64(p[5]) << 40
		fallthrough
	case 5:
		t |= uint64(p[4]) << 32
		fallthrough
	case 4:
		t |= uint64(p[3]) << 24
		fallthrough
	case 3:
		t |= uint64(p[2]) << 16
		fallthrough
	case 2:
		t |= uint64(p[1]) << 8
		fallthrough
	case 1:
		t |= uint64(p[0])
	}

	v3 ^= t

	// Round 1.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 2.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	v0 ^= t

	// Finalization.
	v2 ^= 0xff

	// Round 1.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 2.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 3.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 4.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// return v0 ^ v1 ^ v2 ^ v3

	hash := v0 ^ v1 ^ v2 ^ v3
	h = hash >> bl.shift
	l = hash << bl.shift >> bl.shift
	return l, h

}

@ -0,0 +1,21 @@
MIT License

Copyright (c) 2016 Bo-Yi Wu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,181 @@
package convert

import (
	"fmt"
	"math"
	"strconv"
)

// ToString convert any type to string
func ToString(value interface{}) interface{} {
	if v, ok := value.(*string); ok {
		return *v
	}
	return fmt.Sprintf("%v", value)
}

// ToBool convert any type to boolean
func ToBool(value interface{}) interface{} {
	switch value := value.(type) {
	case bool:
		return value
	case *bool:
		return *value
	case string:
		switch value {
		case "", "false":
			return false
		}
		return true
	case *string:
		return ToBool(*value)
	case float64:
		if value != 0 {
			return true
		}
		return false
	case *float64:
		return ToBool(*value)
	case float32:
		if value != 0 {
			return true
		}
		return false
	case *float32:
		return ToBool(*value)
	case int:
		if value != 0 {
			return true
		}
		return false
	case *int:
		return ToBool(*value)
	}
	return false
}

// ToInt convert any type to int
func ToInt(value interface{}) interface{} {
	switch value := value.(type) {
	case bool:
		if value == true {
			return 1
		}
		return 0
	case int:
		if value < int(math.MinInt32) || value > int(math.MaxInt32) {
			return nil
		}
		return value
	case *int:
		return ToInt(*value)
	case int8:
		return int(value)
	case *int8:
		return int(*value)
	case int16:
		return int(value)
	case *int16:
		return int(*value)
	case int32:
		return int(value)
	case *int32:
		return int(*value)
	case int64:
		if value < int64(math.MinInt32) || value > int64(math.MaxInt32) {
			return nil
		}
		return int(value)
	case *int64:
		return ToInt(*value)
	case uint:
		if value > math.MaxInt32 {
			return nil
		}
		return int(value)
	case *uint:
		return ToInt(*value)
	case uint8:
		return int(value)
	case *uint8:
		return int(*value)
	case uint16:
		return int(value)
	case *uint16:
		return int(*value)
	case uint32:
		if value > uint32(math.MaxInt32) {
			return nil
		}
		return int(value)
	case *uint32:
		return ToInt(*value)
	case uint64:
		if value > uint64(math.MaxInt32) {
			return nil
		}
		return int(value)
	case *uint64:
		return ToInt(*value)
	case float32:
		if value < float32(math.MinInt32) || value > float32(math.MaxInt32) {
			return nil
		}
		return int(value)
	case *float32:
		return ToInt(*value)
	case float64:
		if value < float64(math.MinInt32) || value > float64(math.MaxInt32) {
			return nil
		}
		return int(value)
	case *float64:
		return ToInt(*value)
	case string:
		val, err := strconv.ParseFloat(value, 0)
		if err != nil {
			return nil
		}
		return ToInt(val)
	case *string:
		return ToInt(*value)
	}

	// If the value cannot be transformed into an int, return nil instead of '0'
	// to denote 'no integer found'
	return nil
}

// ToFloat convert any type to float
func ToFloat(value interface{}) interface{} {
	switch value := value.(type) {
	case bool:
		if value == true {
			return 1.0
		}
		return 0.0
	case *bool:
		return ToFloat(*value)
	case int:
		return float64(value)
	case *int32:
		return ToFloat(*value)
	case float32:
		return value
	case *float32:
		return ToFloat(*value)
	case float64:
		return value
	case *float64:
		return ToFloat(*value)
	case string:
		val, err := strconv.ParseFloat(value, 0)
		if err != nil {
			return nil
		}
		return val
	case *string:
		return ToFloat(*value)
	}
	return 0.0
}

@ -0,0 +1,40 @@
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [1.3.0] - 2017-12-12
* Add `DB.NextSequence()` method to generate monotonically increasing integer
  sequences.
* Add `DB.Size()` method to return the size of LSM and value log files.
* Tweaked mmap code to make Windows 32-bit builds work.
* Tweaked build tags on some files to make iOS builds work.
* Fix `DB.PurgeOlderVersions()` to not violate some constraints.

## [1.2.0] - 2017-11-30
* Expose a `Txn.SetEntry()` method to allow setting the key-value pair
  and all the metadata at the same time.

## [1.1.1] - 2017-11-28
* Fix bug where txn.Get was returning a key deleted in the same transaction.
* Fix race condition while decrementing reference in oracle.
* Update doneCommit in the callback for CommitAsync.
* Iterators see writes of the current txn.

## [1.1.0] - 2017-11-13
* Create Badger directory if it does not exist when `badger.Open` is called.
* Added `Item.ValueCopy()` to avoid deadlocks in long-running iterations
* Fixed 64-bit alignment issues to make Badger run on Arm v7

## [1.0.1] - 2017-11-06
* Fix an uint16 overflow when resizing key slice

[Unreleased]: https://github.com/dgraph-io/badger/compare/v1.3.0...HEAD
[1.3.0]: https://github.com/dgraph-io/badger/compare/v1.2.0...v1.3.0
[1.2.0]: https://github.com/dgraph-io/badger/compare/v1.1.1...v1.2.0
[1.1.1]: https://github.com/dgraph-io/badger/compare/v1.1.0...v1.1.1
[1.1.0]: https://github.com/dgraph-io/badger/compare/v1.0.1...v1.1.0
[1.0.1]: https://github.com/dgraph-io/badger/compare/v1.0.0...v1.0.1

@ -0,0 +1,176 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

@ -0,0 +1,623 @@
# BadgerDB [![GoDoc](https://godoc.org/github.com/dgraph-io/badger?status.svg)](https://godoc.org/github.com/dgraph-io/badger) [![Go Report Card](https://goreportcard.com/badge/github.com/dgraph-io/badger)](https://goreportcard.com/report/github.com/dgraph-io/badger) [![Build Status](https://teamcity.dgraph.io/guestAuth/app/rest/builds/buildType:(id:Badger_UnitTests)/statusIcon.svg)](https://teamcity.dgraph.io/viewLog.html?buildTypeId=Badger_UnitTests&buildId=lastFinished&guest=1) ![Appveyor](https://ci.appveyor.com/api/projects/status/github/dgraph-io/badger?branch=master&svg=true) [![Coverage Status](https://coveralls.io/repos/github/dgraph-io/badger/badge.svg?branch=master)](https://coveralls.io/github/dgraph-io/badger?branch=master)

![Badger mascot](images/diggy-shadow.png)

BadgerDB is an embeddable, persistent, simple and fast key-value (KV) database
written in pure Go. It's meant to be a performant alternative to non-Go-based
key-value stores like [RocksDB](https://github.com/facebook/rocksdb).

## Project Status
Badger v1.0 was released in Nov 2017. Check the [Changelog] for the full details.

[Changelog]: https://github.com/dgraph-io/badger/blob/master/CHANGELOG.md

We introduced transactions in [v0.9.0] which involved a major API change. If you have a Badger
datastore prior to that, please use [v0.8.1], but we strongly urge you to upgrade. Upgrading from
both v0.8 and v0.9 will require you to [take backups](#database-backup) and restore using the new
version.

[v1.0.1]: //github.com/dgraph-io/badger/tree/v1.0.1
[v0.8.1]: //github.com/dgraph-io/badger/tree/v0.8.1
[v0.9.0]: //github.com/dgraph-io/badger/tree/v0.9.0

## Table of Contents
* [Getting Started](#getting-started)
  + [Installing](#installing)
  + [Opening a database](#opening-a-database)
  + [Transactions](#transactions)
    - [Read-only transactions](#read-only-transactions)
    - [Read-write transactions](#read-write-transactions)
    - [Managing transactions manually](#managing-transactions-manually)
  + [Using key/value pairs](#using-keyvalue-pairs)
  + [Monotonically increasing integers](#monotonically-increasing-integers)
* [Merge Operations](#merge-operations)
  + [Setting Time To Live(TTL) and User Metadata on Keys](#setting-time-to-livettl-and-user-metadata-on-keys)
  + [Iterating over keys](#iterating-over-keys)
    - [Prefix scans](#prefix-scans)
    - [Key-only iteration](#key-only-iteration)
  + [Garbage Collection](#garbage-collection)
  + [Database backup](#database-backup)
  + [Memory usage](#memory-usage)
  + [Statistics](#statistics)
* [Resources](#resources)
  + [Blog Posts](#blog-posts)
* [Contact](#contact)
* [Design](#design)
  + [Comparisons](#comparisons)
  + [Benchmarks](#benchmarks)
* [Other Projects Using Badger](#other-projects-using-badger)
* [Frequently Asked Questions](#frequently-asked-questions)

## Getting Started

### Installing
To start using Badger, install Go 1.8 or above and run `go get`:

```sh
$ go get github.com/dgraph-io/badger/...
```

This will retrieve the library and install the `badger_info` command line
utility into your `$GOBIN` path.


### Opening a database
The top-level object in Badger is a `DB`. It represents multiple files on disk
in specific directories, which contain the data for a single database.

To open your database, use the `badger.Open()` function, with the appropriate
options. The `Dir` and `ValueDir` options are mandatory and must be
specified by the client. They can be set to the same value to simplify things.

```go
package main

import (
	"log"

	"github.com/dgraph-io/badger"
)

func main() {
	// Open the Badger database located in the /tmp/badger directory.
	// It will be created if it doesn't exist.
	opts := badger.DefaultOptions
	opts.Dir = "/tmp/badger"
	opts.ValueDir = "/tmp/badger"
	db, err := badger.Open(opts)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	// Your code here…
}
```

Please note that Badger obtains a lock on the directories so multiple processes
cannot open the same database at the same time.

### Transactions

#### Read-only transactions
To start a read-only transaction, you can use the `DB.View()` method:

```go
err := db.View(func(txn *badger.Txn) error {
	// Your code here…
	return nil
})
```

You cannot perform any writes or deletes within this transaction. Badger
ensures that you get a consistent view of the database within this closure. Any
writes that happen elsewhere after the transaction has started, will not be
seen by calls made within the closure.

#### Read-write transactions
To start a read-write transaction, you can use the `DB.Update()` method:

```go
err := db.Update(func(txn *badger.Txn) error {
	// Your code here…
	return nil
})
```

All database operations are allowed inside a read-write transaction.

Always check the returned error value. If you return an error
within your closure it will be passed through.

An `ErrConflict` error will be reported in case of a conflict. Depending on the state
of your application, you have the option to retry the operation if you receive
this error.
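
A minimal retry sketch for that pattern (not part of the upstream examples; the three-attempt limit and the key/value are arbitrary choices, and the error is assumed to be exported as `badger.ErrConflict`):

```go
// Retry the read-write transaction a few times if it fails with a conflict.
var err error
for attempt := 0; attempt < 3; attempt++ {
	err = db.Update(func(txn *badger.Txn) error {
		// Read-modify-write logic here…
		return txn.Set([]byte("counter"), []byte("1"))
	})
	if err != badger.ErrConflict {
		break // success, or an error that retrying will not fix
	}
}
```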
|
||||
|
||||
An `ErrTxnTooBig` will be reported in case the number of pending writes/deletes in
|
||||
the transaction exceed a certain limit. In that case, it is best to commit the
|
||||
transaction and start a new transaction immediately. Here is an example (we are
|
||||
not checking for errors in some places for simplicity):
|
||||
|
||||
```go
|
||||
updates := make(map[string]string)
|
||||
txn := db.NewTransaction(true)
|
||||
for k,v := range updates {
|
||||
if err := txn.Set([]byte(k),[]byte(v)); err == ErrTxnTooBig {
|
||||
_ = txn.Commit()
|
||||
txn = db.NewTransaction(..)
|
||||
_ = txn.Set([]byte(k),[]byte(v))
|
||||
}
|
||||
}
|
||||
_ = txn.Commit()
|
||||
```
|
||||
|
||||
#### Managing transactions manually
The `DB.View()` and `DB.Update()` methods are wrappers around the
`DB.NewTransaction()` and `Txn.Commit()` methods (or `Txn.Discard()` in case of
read-only transactions). These helper methods will start the transaction,
execute a function, and then safely discard your transaction if an error is
returned. This is the recommended way to use Badger transactions.

However, sometimes you may want to manually create and commit your
transactions. You can use the `DB.NewTransaction()` function directly, which
takes in a boolean argument to specify whether a read-write transaction is
required. For read-write transactions, it is necessary to call `Txn.Commit()`
to ensure the transaction is committed. For read-only transactions, calling
`Txn.Discard()` is sufficient. `Txn.Commit()` also calls `Txn.Discard()`
internally to clean up the transaction, so just calling `Txn.Commit()` is
sufficient for read-write transactions. However, if your code doesn't call
`Txn.Commit()` for some reason (e.g., it returns prematurely with an error),
then please make sure you call `Txn.Discard()` in a `defer` block. Refer to the
code below.

```go
// Start a writable transaction.
txn, err := db.NewTransaction(true)
if err != nil {
	return err
}
defer txn.Discard()

// Use the transaction...
if err := txn.Set([]byte("answer"), []byte("42")); err != nil {
	return err
}

// Commit the transaction and check for error.
if err := txn.Commit(nil); err != nil {
	return err
}
```

The first argument to `DB.NewTransaction()` is a boolean stating if the transaction
should be writable.

Badger allows an optional callback to the `Txn.Commit()` method. Normally, the
callback can be set to `nil`, and the method will return after all the writes
have succeeded. However, if this callback is provided, the `Txn.Commit()`
method returns as soon as it has checked for any conflicts. The actual writing
to the disk happens asynchronously, and the callback is invoked once the
writing has finished, or an error has occurred. This can improve the throughput
of the application in some cases. But it also means that a transaction is not
durable until the callback has been invoked with a `nil` error value.
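
To make that durability hand-off concrete, here is a minimal sketch that
continues the manual transaction above; the channel is our own scaffolding for
waiting on the callback, not part of the API:

```go
// Commit asynchronously; the transaction is durable only once the
// callback fires with a nil error.
done := make(chan error, 1)
if err := txn.Commit(func(err error) { done <- err }); err != nil {
	return err // conflict detected at commit time
}
if err := <-done; err != nil {
	return err // the asynchronous disk write failed
}
```
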
### Using key/value pairs
To save a key/value pair, use the `Txn.Set()` method:

```go
err := db.Update(func(txn *badger.Txn) error {
	err := txn.Set([]byte("answer"), []byte("42"))
	return err
})
```

This will set the value of the `"answer"` key to `"42"`. To retrieve this
value, we can use the `Txn.Get()` method:

```go
err := db.View(func(txn *badger.Txn) error {
	item, err := txn.Get([]byte("answer"))
	if err != nil {
		return err
	}
	val, err := item.Value()
	if err != nil {
		return err
	}
	fmt.Printf("The answer is: %s\n", val)
	return nil
})
```

`Txn.Get()` returns `ErrKeyNotFound` if the value is not found.

Please note that values returned from `Get()` are only valid while the
transaction is open. If you need to use a value outside of the transaction,
then you must use `copy()` to copy it to another byte slice.
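
A minimal sketch of that copy, assuming the `val` from the `Get()` example
above:

```go
// Copy the value so it can be used after the transaction closes.
valCopy := make([]byte, len(val))
copy(valCopy, val)
// Alternatively, Item.ValueCopy() does the copying for you (see the FAQ below).
```
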
Use the `Txn.Delete()` method to delete a key.

### Monotonically increasing integers

To get unique monotonically increasing integers with strong durability, you can
use the `DB.GetSequence` method. This method returns a `Sequence` object, which
is thread-safe and can be used concurrently via various goroutines.

Badger leases a range of integers to hand out from memory, with the
bandwidth provided to `DB.GetSequence`. The frequency at which disk writes are
done is determined by this lease bandwidth and the frequency of `Next`
invocations. Setting the bandwidth too low would cause more disk writes; setting it
too high would result in wasted integers if Badger is closed or crashes.
To avoid wasted integers, call `Release` before closing Badger.

```go
seq, err := db.GetSequence(key, 1000)
if err != nil {
	return err
}
defer seq.Release()
for {
	num, err := seq.Next()
	if err != nil {
		return err
	}
	fmt.Println(num)
}
```

### Merge Operations
Badger provides support for unordered merge operations. You can define a func
of type `MergeFunc` which takes in an existing value, and a value to be
_merged_ with it. It returns a new value which is the result of the _merge_
operation. All values are specified as byte slices. For example, here is a merge
function (`add`) which adds a `uint64` value to an existing `uint64` value.

```go
func uint64ToBytes(i uint64) []byte {
	var buf [8]byte
	binary.BigEndian.PutUint64(buf[:], i)
	return buf[:]
}

func bytesToUint64(b []byte) uint64 {
	return binary.BigEndian.Uint64(b)
}

// Merge function to add two uint64 numbers
func add(existing, new []byte) []byte {
	return uint64ToBytes(bytesToUint64(existing) + bytesToUint64(new))
}
```

This function can then be passed to the `DB.GetMergeOperator()` method, along
with a key, and a duration value. The duration specifies how often the merge
function is run on values that have been added using the `MergeOperator.Add()`
method.

The `MergeOperator.Get()` method can be used to retrieve the cumulative value of the key
associated with the merge operation.

```go
key := []byte("merge")
m := db.GetMergeOperator(key, add, 200*time.Millisecond)
defer m.Stop()

m.Add(uint64ToBytes(1))
m.Add(uint64ToBytes(2))
m.Add(uint64ToBytes(3))

res, err := m.Get() // res should have value 6 encoded
fmt.Println(bytesToUint64(res))
```

### Setting Time To Live (TTL) and User Metadata on Keys
Badger allows setting an optional Time to Live (TTL) value on keys. Once the TTL has
elapsed, the key will no longer be retrievable and will be eligible for garbage
collection. A TTL can be set as a `time.Duration` value using the `Txn.SetWithTTL()`
API method.

An optional user metadata value can be set on each key. A user metadata value
is represented by a single byte. It can be used to set certain bits along
with the key to aid in interpreting or decoding the key-value pair. User
metadata can be set using the `Txn.SetWithMeta()` API method.

`Txn.SetEntry()` can be used to set the key, value, user metadata and TTL,
all at once.
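
A minimal sketch combining the two methods named above (the one-hour TTL and
the `0x01` metadata byte are arbitrary illustrative values):

```go
err := db.Update(func(txn *badger.Txn) error {
	// Expire this key an hour from now.
	if err := txn.SetWithTTL([]byte("session"), []byte("data"), time.Hour); err != nil {
		return err
	}
	// Tag another key with a single user metadata byte.
	return txn.SetWithMeta([]byte("answer"), []byte("42"), 0x01)
})
```
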
### Iterating over keys
To iterate over keys, we can use an `Iterator`, which can be obtained using the
`Txn.NewIterator()` method. Iteration happens in byte-wise lexicographical sorting
order.

```go
err := db.View(func(txn *badger.Txn) error {
	opts := badger.DefaultIteratorOptions
	opts.PrefetchSize = 10
	it := txn.NewIterator(opts)
	defer it.Close()
	for it.Rewind(); it.Valid(); it.Next() {
		item := it.Item()
		k := item.Key()
		v, err := item.Value()
		if err != nil {
			return err
		}
		fmt.Printf("key=%s, value=%s\n", k, v)
	}
	return nil
})
```

The iterator allows you to move to a specific point in the list of keys and move
forward or backward through the keys one at a time.

By default, Badger prefetches the values of the next 100 items. You can adjust
that with the `IteratorOptions.PrefetchSize` field. However, setting it to
a value higher than `GOMAXPROCS` (which we recommend to be 128 or higher)
shouldn't give any additional benefits. You can also turn off the fetching of
values altogether. See the section below on key-only iteration.

#### Prefix scans
To iterate over a key prefix, you can combine `Seek()` and `ValidForPrefix()`:

```go
db.View(func(txn *badger.Txn) error {
	it := txn.NewIterator(badger.DefaultIteratorOptions)
	defer it.Close()
	prefix := []byte("1234")
	for it.Seek(prefix); it.ValidForPrefix(prefix); it.Next() {
		item := it.Item()
		k := item.Key()
		v, err := item.Value()
		if err != nil {
			return err
		}
		fmt.Printf("key=%s, value=%s\n", k, v)
	}
	return nil
})
```

#### Key-only iteration
Badger supports a unique mode of iteration called _key-only_ iteration. It is
several orders of magnitude faster than regular iteration, because it involves
access to the LSM-tree only, which is usually resident entirely in RAM. To
enable key-only iteration, you need to set the `IteratorOptions.PrefetchValues`
field to `false`. This can also be used to do sparse reads for selected keys
during an iteration, by calling `item.Value()` only when required.

```go
err := db.View(func(txn *badger.Txn) error {
	opts := badger.DefaultIteratorOptions
	opts.PrefetchValues = false
	it := txn.NewIterator(opts)
	defer it.Close()
	for it.Rewind(); it.Valid(); it.Next() {
		item := it.Item()
		k := item.Key()
		fmt.Printf("key=%s\n", k)
	}
	return nil
})
```

### Garbage Collection
Badger values need to be garbage collected, for two reasons:

* Badger keeps values separately from the LSM tree. This means that the compaction operations
that clean up the LSM tree do not touch the values at all. Values need to be cleaned up
separately.

* Concurrent read/write transactions could leave behind multiple values for a single key, because they
are stored with different versions. These could accumulate, and take up unneeded space beyond the
time these older versions are needed.

Badger relies on the client to perform garbage collection at a time of their choosing. It provides
the following methods, which can be invoked at an appropriate time (a usage sketch follows the list):

* `DB.PurgeOlderVersions()`: This method iterates over the database, and cleans up all but the latest
versions of the key-value pairs. It marks the older versions as deleted, which makes them eligible for
garbage collection.
* `DB.PurgeVersionsBelow(key, ts)`: This method is useful to do a more targeted clean up of older versions
of key-value pairs. You can specify a key, and a timestamp. All versions of the key older than the timestamp
are marked as deleted, making them eligible for garbage collection.
* `DB.RunValueLogGC()`: This method is designed to do garbage collection while
Badger is online. Please ensure that you call the `DB.Purge…()` methods first
before invoking this method. It uses any statistics generated by the
`DB.Purge…()` methods to pick files that are likely to lead to maximum space
reclamation. It loops until it encounters a file which does not lead to any
garbage collection.

It could lead to increased I/O if `DB.RunValueLogGC()` hasn't been called for
a long time, and many deletes have happened in the meanwhile. So it is recommended
that this method be called regularly.
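
A minimal sketch of a periodic GC pass built from the methods above. The
ten-minute interval is arbitrary, and we call `RunValueLogGC` with no arguments
as the list above does; some Badger versions take a discard-ratio argument
instead, so check the signature in your vendored copy:

```go
ticker := time.NewTicker(10 * time.Minute)
defer ticker.Stop()
for range ticker.C {
	// Mark older versions as deleted first, so GC has something to reclaim.
	if err := db.PurgeOlderVersions(); err != nil {
		log.Printf("purge: %v", err)
		continue
	}
	if err := db.RunValueLogGC(); err != nil {
		log.Printf("value log GC: %v", err) // e.g. ErrNoRewrite when nothing to do
	}
}
```
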
### Database backup
There are two public API methods, `DB.Backup()` and `DB.Load()`, which can be
used to do online backups and restores. Badger v0.9 provides a CLI tool
`badger`, which can do offline backup/restore. Make sure you have `$GOPATH/bin`
in your PATH to use this tool.

The command below will create a version-agnostic backup of the database, to a
file `badger.bak` in the current working directory:

```
badger backup --dir <path/to/badgerdb>
```

To restore `badger.bak` in the current working directory to a new database:

```
badger restore --dir <path/to/badgerdb>
```

See `badger --help` for more details.

If you have a Badger database that was created using v0.8 (or below), you can
use the `badger_backup` tool provided in v0.8.1, and then restore it using the
command above to upgrade your database to work with the latest version.

```
badger_backup --dir <path/to/badgerdb> --backup-file badger.bak
```

### Memory usage
Badger's memory usage can be managed by tweaking several options available in
the `Options` struct that is passed in when opening the database using
`DB.Open`.

- `Options.ValueLogLoadingMode` can be set to `options.FileIO` (instead of the
  default `options.MemoryMap`) to avoid memory-mapping log files. This can be
  useful in environments with low RAM.
- Number of memtables (`Options.NumMemtables`)
  - If you modify `Options.NumMemtables`, also adjust `Options.NumLevelZeroTables` and
    `Options.NumLevelZeroTablesStall` accordingly.
- Number of concurrent compactions (`Options.NumCompactors`)
- Mode in which LSM tree is loaded (`Options.TableLoadingMode`)
- Size of table (`Options.MaxTableSize`)
- Size of value log file (`Options.ValueLogFileSize`)

If you want to decrease the memory usage of a Badger instance, tweak these
options (ideally one at a time) until you achieve the desired
memory usage. A sketch of a low-memory configuration follows.
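
As an illustration only (the exact values depend on your workload and are our
own guesses, not recommendations), a configuration that trades throughput for
a smaller footprint might look like this, using the option fields listed above:

```go
// Requires: import "github.com/dgraph-io/badger/options"
opts := badger.DefaultOptions
opts.Dir = "/tmp/badger"
opts.ValueDir = "/tmp/badger"
// Read the value log with plain file I/O instead of mmap.
opts.ValueLogLoadingMode = options.FileIO
// Fewer memtables and level-zero tables mean less resident memory.
opts.NumMemtables = 2
opts.NumLevelZeroTables = 2
opts.NumLevelZeroTablesStall = 4
opts.MaxTableSize = 16 << 20     // 16 MB tables
opts.ValueLogFileSize = 64 << 20 // 64 MB value log files
db, err := badger.Open(opts)
```
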
### Statistics
Badger records metrics using the [expvar] package, which is included in the Go
standard library. All the metrics are documented in the [y/metrics.go][metrics]
file.

The `expvar` package adds a handler to the default HTTP server (which has to be
started explicitly), and serves up the metrics at the `/debug/vars` endpoint.
These metrics can then be collected by a system like [Prometheus], to get
better visibility into what Badger is doing.

[expvar]: https://golang.org/pkg/expvar/
[metrics]: https://github.com/dgraph-io/badger/blob/master/y/metrics.go
[Prometheus]: https://prometheus.io/
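
Starting that default server is a one-liner. A minimal sketch (the listen
address is arbitrary):

```go
import (
	_ "expvar" // registers the /debug/vars handler on the default mux
	"log"
	"net/http"
)

func main() {
	// Metrics are then served at http://localhost:8080/debug/vars.
	go func() {
		log.Println(http.ListenAndServe("localhost:8080", nil))
	}()
	// … open Badger and do work here …
}
```
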
## Resources

### Blog Posts
1. [Introducing Badger: A fast key-value store written natively in Go](https://open.dgraph.io/post/badger/)
2. [Make Badger crash resilient with ALICE](https://blog.dgraph.io/post/alice/)
3. [Badger vs LMDB vs BoltDB: Benchmarking key-value databases in Go](https://blog.dgraph.io/post/badger-lmdb-boltdb/)
4. [Concurrent ACID Transactions in Badger](https://blog.dgraph.io/post/badger-txn/)

## Design
Badger was written with these design goals in mind:

- Write a key-value database in pure Go.
- Use the latest research to build the fastest KV database for data sets spanning terabytes.
- Optimize for SSDs.

Badger's design is based on a paper titled _[WiscKey: Separating Keys from
Values in SSD-conscious Storage][wisckey]_.

[wisckey]: https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf

### Comparisons
| Feature | Badger | RocksDB | BoltDB |
| ------- | ------ | ------- | ------ |
| Design | LSM tree with value log | LSM tree only | B+ tree |
| High read throughput | Yes | No | Yes |
| High write throughput | Yes | Yes | No |
| Designed for SSDs | Yes (with latest research <sup>1</sup>) | Not specifically <sup>2</sup> | No |
| Embeddable | Yes | Yes | Yes |
| Sorted KV access | Yes | Yes | Yes |
| Pure Go (no Cgo) | Yes | No | Yes |
| Transactions | Yes, ACID, concurrent with SSI<sup>3</sup> | Yes (but non-ACID) | Yes, ACID |
| Snapshots | Yes | Yes | Yes |
| TTL support | Yes | Yes | No |

<sup>1</sup> The [WiscKey paper][wisckey] (on which Badger is based) saw big
wins with separating values from keys, significantly reducing the write
amplification compared to a typical LSM tree.

<sup>2</sup> RocksDB is an SSD-optimized version of LevelDB, which was designed specifically for rotating disks.
As such, RocksDB's design isn't aimed at SSDs.

<sup>3</sup> SSI: Serializable Snapshot Isolation. For more details, see the blog post [Concurrent ACID Transactions in Badger](https://blog.dgraph.io/post/badger-txn/)

### Benchmarks
We have run comprehensive benchmarks against RocksDB, Bolt and LMDB. The
benchmarking code, and the detailed logs for the benchmarks, can be found in the
[badger-bench] repo. More explanation, including graphs, can be found in the blog posts (linked
above).

[badger-bench]: https://github.com/dgraph-io/badger-bench

## Other Projects Using Badger
Below is a list of known projects that use Badger:

* [Usenet Express](https://usenetexpress.com/) - Serving over 300TB of data with Badger.
* [Dgraph](https://github.com/dgraph-io/dgraph) - Distributed graph database.
* [go-ipfs](https://github.com/ipfs/go-ipfs) - Go client for the InterPlanetary File System (IPFS), a new hypermedia distribution protocol.
* [0-stor](https://github.com/zero-os/0-stor) - Single device object store.
* [Sandglass](https://github.com/celrenheit/sandglass) - Distributed, horizontally scalable, persistent, time-sorted message queue.

If you are using Badger in a project, please send a pull request to add it to the list.

## Frequently Asked Questions
- **My writes are getting stuck. Why?**

This can happen if a long-running iteration is run with `Prefetch` set to false, but
an `Item::Value` call is made inside the loop. That causes Badger to
acquire read locks over the value log files, to prevent value log GC from removing the
file from underneath. As a side effect, this also blocks a new value log
file from being created, when the value log file boundary is hit.

Please see GitHub issues [#293](https://github.com/dgraph-io/badger/issues/293)
and [#315](https://github.com/dgraph-io/badger/issues/315).

There are multiple workarounds during iteration (the first is sketched after this list):

1. Use `Item::ValueCopy` instead of `Item::Value` when retrieving a value.
1. Set `Prefetch` to true. Badger would then copy over the value and release the
file lock immediately.
1. When `Prefetch` is false, don't call `Item::Value` and do a pure key-only
iteration. This might be useful if you just want to delete a lot of keys.
1. Do the writes in a separate transaction after the reads.
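
A minimal sketch of the `ValueCopy` workaround (error handling kept short for
brevity):

```go
err := db.View(func(txn *badger.Txn) error {
	opts := badger.DefaultIteratorOptions
	opts.PrefetchValues = false
	it := txn.NewIterator(opts)
	defer it.Close()
	for it.Rewind(); it.Valid(); it.Next() {
		// ValueCopy copies the value out, releasing the value log file lock.
		v, err := it.Item().ValueCopy(nil)
		if err != nil {
			return err
		}
		fmt.Printf("value=%s\n", v)
	}
	return nil
})
```
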
- **My writes are really slow. Why?**

Are you creating a new transaction for every single key update? This will lead
to very low throughput. To get the best write performance, batch up multiple writes
inside a transaction using a single `DB.Update()` call. You could also have
multiple such `DB.Update()` calls being made concurrently from multiple
goroutines.
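
A minimal sketch of that batching (the key scheme and count are arbitrary; if
a batch grows too large, handle `ErrTxnTooBig` as shown earlier):

```go
err := db.Update(func(txn *badger.Txn) error {
	// One transaction, many writes: far fewer commits than one Update per key.
	for i := 0; i < 1000; i++ {
		key := []byte(fmt.Sprintf("key-%04d", i))
		if err := txn.Set(key, []byte("value")); err != nil {
			return err
		}
	}
	return nil
})
```
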
- **I don't see any disk write. Why?**

If you're using Badger with `SyncWrites=false`, then your writes might not be written to the value log
and won't get synced to disk immediately. Writes to the LSM tree are done in memory first, before they
get compacted to disk. The compaction would only happen once `MaxTableSize` has been reached. So, if
you're doing a few writes and then checking, you might not see anything on disk. Once you `Close`
the database, you'll see these writes on disk.
- **Reverse iteration doesn't give me the right results.**

Just like forward iteration goes to the first key which is equal to or greater than the SEEK key, reverse iteration goes to the first key which is equal to or lesser than the SEEK key. Therefore, the SEEK key would not be part of the results. You can typically add a tilde (~) as a suffix to the SEEK key to include it in the results. See the following issues: [#436](https://github.com/dgraph-io/badger/issues/436) and [#347](https://github.com/dgraph-io/badger/issues/347).
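
A minimal sketch of a reverse scan using the tilde-suffix trick from above
(the key names are arbitrary):

```go
err := db.View(func(txn *badger.Txn) error {
	opts := badger.DefaultIteratorOptions
	opts.Reverse = true
	it := txn.NewIterator(opts)
	defer it.Close()
	// Seek just past "key-0009" so that it is included in the results.
	for it.Seek([]byte("key-0009~")); it.Valid(); it.Next() {
		fmt.Printf("key=%s\n", it.Item().Key())
	}
	return nil
})
```
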
- **Which instances should I use for Badger?**

We recommend using instances which provide local SSD storage, without any limit
on the maximum IOPS. In AWS, these are storage-optimized instances like i3. They
provide local SSDs which clock 100K IOPS over 4KB blocks easily.
- **I'm getting a closed channel error. Why?**

```
panic: close of closed channel
panic: send on closed channel
```

If you're seeing panics like the above, it is because you're operating on a closed DB. This can happen if you call `Close()` before sending a write, or call it multiple times. You should ensure that you only call `Close()` once, and that all your read/write operations finish before closing.
- **Are there any Go specific settings that I should use?**

We *highly* recommend setting a high number for GOMAXPROCS, which allows Go to
observe the full IOPS throughput provided by modern SSDs. In Dgraph, we have set
it to 128. For more details, [see this
thread](https://groups.google.com/d/topic/golang-nuts/jPb_h3TvlKE/discussion).
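
A minimal sketch of setting this at startup (128 matches the value quoted
above; tune it for your own hardware):

```go
import "runtime"

func init() {
	// Allow Go to issue enough concurrent I/O to saturate a modern SSD.
	runtime.GOMAXPROCS(128)
}
```
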
## Contact
- Please use [discuss.dgraph.io](https://discuss.dgraph.io) for questions, feature requests and discussions.
- Please use the [GitHub issue tracker](https://github.com/dgraph-io/badger/issues) for filing bugs or feature requests.
- Join [![Slack Status](http://slack.dgraph.io/badge.svg)](http://slack.dgraph.io).
- Follow us on Twitter [@dgraphlabs](https://twitter.com/dgraphlabs).

@ -0,0 +1,48 @@

# version format
version: "{build}"

# Operating system (build VM template)
os: Windows Server 2012 R2

# Platform.
platform: x64

clone_folder: c:\gopath\src\github.com\dgraph-io\badger

# Environment variables
environment:
  GOVERSION: 1.8.3
  GOPATH: c:\gopath

# scripts that run after cloning repository
install:
  - set PATH=%GOPATH%\bin;c:\go\bin;%PATH%
  - go version
  - go env
  - python --version

# To run your custom scripts instead of automatic MSBuild
build_script:
  # We need to disable firewall - https://github.com/appveyor/ci/issues/1579#issuecomment-309830648
  - ps: Disable-NetFirewallRule -DisplayName 'File and Printer Sharing (SMB-Out)'
  - cd c:\gopath\src\github.com\dgraph-io\badger
  - git branch
  - go get -t ./...

# To run your custom scripts instead of automatic tests
test_script:
  # Unit tests
  - ps: Add-AppveyorTest "Unit Tests" -Outcome Running
  - go test -v github.com/dgraph-io/badger/...
  - go test -v -vlog_mmap=false github.com/dgraph-io/badger/...
  - ps: Update-AppveyorTest "Unit Tests" -Outcome Passed

notifications:
  - provider: Email
    to:
      - pawan@dgraph.io
    on_build_failure: true
    on_build_status_changed: true

# to disable deployment
deploy: off

@ -0,0 +1,158 @@

package badger

import (
	"bufio"
	"encoding/binary"
	"io"
	"log"
	"sync"

	"github.com/dgraph-io/badger/y"

	"github.com/dgraph-io/badger/protos"
)

func writeTo(entry *protos.KVPair, w io.Writer) error {
	if err := binary.Write(w, binary.LittleEndian, uint64(entry.Size())); err != nil {
		return err
	}
	buf, err := entry.Marshal()
	if err != nil {
		return err
	}
	_, err = w.Write(buf)
	return err
}

// Backup dumps a protobuf-encoded list of all entries in the database into the
// given writer, that are newer than the specified version. It returns a
// timestamp indicating when the entries were dumped which can be passed into a
// later invocation to generate an incremental dump, of entries that have been
// added/modified since the last invocation of DB.Backup()
//
// This can be used to backup the data in a database at a given point in time.
func (db *DB) Backup(w io.Writer, since uint64) (uint64, error) {
	var tsNew uint64
	err := db.View(func(txn *Txn) error {
		opts := DefaultIteratorOptions
		opts.AllVersions = true
		it := txn.NewIterator(opts)
		for it.Rewind(); it.Valid(); it.Next() {
			item := it.Item()
			if item.Version() < since {
				// Ignore versions less than given timestamp
				continue
			}
			val, err := item.Value()
			if err != nil {
				log.Printf("Key [%x]. Error while fetching value [%v]\n", item.Key(), err)
				continue
			}

			entry := &protos.KVPair{
				Key:       y.Copy(item.Key()),
				Value:     y.Copy(val),
				UserMeta:  []byte{item.UserMeta()},
				Version:   item.Version(),
				ExpiresAt: item.ExpiresAt(),
			}

			// Write entries to disk
			if err := writeTo(entry, w); err != nil {
				return err
			}
		}
		tsNew = txn.readTs
		return nil
	})
	return tsNew, err
}

// Load reads a protobuf-encoded list of all entries from a reader and writes
// them to the database. This can be used to restore the database from a backup
// made by calling DB.Backup().
//
// DB.Load() should be called on a database that is not running any other
// concurrent transactions while it is running.
func (db *DB) Load(r io.Reader) error {
	br := bufio.NewReaderSize(r, 16<<10)
	unmarshalBuf := make([]byte, 1<<10)
	var entries []*Entry
	var wg sync.WaitGroup
	errChan := make(chan error, 1)

	// func to check for pending error before sending off a batch for writing
	batchSetAsyncIfNoErr := func(entries []*Entry) error {
		select {
		case err := <-errChan:
			return err
		default:
			wg.Add(1)
			return db.batchSetAsync(entries, func(err error) {
				defer wg.Done()
				if err != nil {
					select {
					case errChan <- err:
					default:
					}
				}
			})
		}
	}

	for {
		var sz uint64
		err := binary.Read(br, binary.LittleEndian, &sz)
		if err == io.EOF {
			break
		} else if err != nil {
			return err
		}

		if cap(unmarshalBuf) < int(sz) {
			unmarshalBuf = make([]byte, sz)
		}

		e := &protos.KVPair{}
		if _, err = io.ReadFull(br, unmarshalBuf[:sz]); err != nil {
			return err
		}
		if err = e.Unmarshal(unmarshalBuf[:sz]); err != nil {
			return err
		}
		entries = append(entries, &Entry{
			Key:       y.KeyWithTs(e.Key, e.Version),
			Value:     e.Value,
			UserMeta:  e.UserMeta[0],
			ExpiresAt: e.ExpiresAt,
		})
		// Update nextCommit, memtable stores this timestamp in badger head
		// when flushed.
		if e.Version >= db.orc.commitTs() {
			db.orc.nextCommit = e.Version + 1
		}

		if len(entries) == 1000 {
			if err := batchSetAsyncIfNoErr(entries); err != nil {
				return err
			}
			entries = make([]*Entry, 0, 1000)
		}
	}

	if len(entries) > 0 {
		if err := batchSetAsyncIfNoErr(entries); err != nil {
			return err
		}
	}

	wg.Wait()

	select {
	case err := <-errChan:
		return err
	default:
		db.orc.curRead = db.orc.commitTs() - 1
		return nil
	}
}

@ -0,0 +1,208 @@

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"bytes"
	"fmt"
	"log"
	"sync"

	"golang.org/x/net/trace"

	"github.com/dgraph-io/badger/table"
	"github.com/dgraph-io/badger/y"
)

type keyRange struct {
	left  []byte
	right []byte
	inf   bool
}

var infRange = keyRange{inf: true}

func (r keyRange) String() string {
	return fmt.Sprintf("[left=%q, right=%q, inf=%v]", r.left, r.right, r.inf)
}

func (r keyRange) equals(dst keyRange) bool {
	return bytes.Equal(r.left, dst.left) &&
		bytes.Equal(r.right, dst.right) &&
		r.inf == dst.inf
}

func (r keyRange) overlapsWith(dst keyRange) bool {
	if r.inf || dst.inf {
		return true
	}

	// If my left is greater than dst right, we have no overlap.
	if y.CompareKeys(r.left, dst.right) > 0 {
		return false
	}
	// If my right is less than dst left, we have no overlap.
	if y.CompareKeys(r.right, dst.left) < 0 {
		return false
	}
	// We have overlap.
	return true
}

func getKeyRange(tables []*table.Table) keyRange {
	y.AssertTrue(len(tables) > 0)
	smallest := tables[0].Smallest()
	biggest := tables[0].Biggest()
	for i := 1; i < len(tables); i++ {
		if y.CompareKeys(tables[i].Smallest(), smallest) < 0 {
			smallest = tables[i].Smallest()
		}
		if y.CompareKeys(tables[i].Biggest(), biggest) > 0 {
			biggest = tables[i].Biggest()
		}
	}
	return keyRange{left: smallest, right: biggest}
}

type levelCompactStatus struct {
	ranges  []keyRange
	delSize int64
}

func (lcs *levelCompactStatus) debug() string {
	var b bytes.Buffer
	for _, r := range lcs.ranges {
		b.WriteString(r.String())
	}
	return b.String()
}

func (lcs *levelCompactStatus) overlapsWith(dst keyRange) bool {
	for _, r := range lcs.ranges {
		if r.overlapsWith(dst) {
			return true
		}
	}
	return false
}

func (lcs *levelCompactStatus) remove(dst keyRange) bool {
	final := lcs.ranges[:0]
	var found bool
	for _, r := range lcs.ranges {
		if !r.equals(dst) {
			final = append(final, r)
		} else {
			found = true
		}
	}
	lcs.ranges = final
	return found
}

type compactStatus struct {
	sync.RWMutex
	levels []*levelCompactStatus
}

func (cs *compactStatus) toLog(tr trace.Trace) {
	cs.RLock()
	defer cs.RUnlock()

	tr.LazyPrintf("Compaction status:")
	for i, l := range cs.levels {
		if len(l.debug()) == 0 {
			continue
		}
		tr.LazyPrintf("[%d] %s", i, l.debug())
	}
}

func (cs *compactStatus) overlapsWith(level int, this keyRange) bool {
	cs.RLock()
	defer cs.RUnlock()

	thisLevel := cs.levels[level]
	return thisLevel.overlapsWith(this)
}

func (cs *compactStatus) delSize(l int) int64 {
	cs.RLock()
	defer cs.RUnlock()
	return cs.levels[l].delSize
}

type thisAndNextLevelRLocked struct{}

// compareAndAdd will check whether we can run this compactDef. That it doesn't overlap with any
// other running compaction. If it can be run, it would store this run in the compactStatus state.
func (cs *compactStatus) compareAndAdd(_ thisAndNextLevelRLocked, cd compactDef) bool {
	cs.Lock()
	defer cs.Unlock()

	level := cd.thisLevel.level

	y.AssertTruef(level < len(cs.levels)-1, "Got level %d. Max levels: %d", level, len(cs.levels))
	thisLevel := cs.levels[level]
	nextLevel := cs.levels[level+1]

	if thisLevel.overlapsWith(cd.thisRange) {
		return false
	}
	if nextLevel.overlapsWith(cd.nextRange) {
		return false
	}
	// Check whether this level really needs compaction or not. Otherwise, we'll end up
	// running parallel compactions for the same level.
	// NOTE: We can directly call thisLevel.totalSize, because we have already acquired a read lock
	// over this and the next level.
	if cd.thisLevel.totalSize-thisLevel.delSize < cd.thisLevel.maxTotalSize {
		return false
	}

	thisLevel.ranges = append(thisLevel.ranges, cd.thisRange)
	nextLevel.ranges = append(nextLevel.ranges, cd.nextRange)
	thisLevel.delSize += cd.thisSize

	return true
}

func (cs *compactStatus) delete(cd compactDef) {
	cs.Lock()
	defer cs.Unlock()

	level := cd.thisLevel.level
	y.AssertTruef(level < len(cs.levels)-1, "Got level %d. Max levels: %d", level, len(cs.levels))

	thisLevel := cs.levels[level]
	nextLevel := cs.levels[level+1]

	thisLevel.delSize -= cd.thisSize
	found := thisLevel.remove(cd.thisRange)
	found = nextLevel.remove(cd.nextRange) && found

	if !found {
		this := cd.thisRange
		next := cd.nextRange
		fmt.Printf("Looking for: [%q, %q, %v] in this level.\n", this.left, this.right, this.inf)
		fmt.Printf("This Level:\n%s\n", thisLevel.debug())
		fmt.Println()
		fmt.Printf("Looking for: [%q, %q, %v] in next level.\n", next.left, next.right, next.inf)
		fmt.Printf("Next Level:\n%s\n", nextLevel.debug())
		log.Fatal("keyRange not found")
	}
}

@ -0,0 +1,100 @@

// +build !windows

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"

	"github.com/pkg/errors"
	"golang.org/x/sys/unix"
)

// directoryLockGuard holds a lock on a directory and a pid file inside. The pid file isn't part
// of the locking mechanism, it's just advisory.
type directoryLockGuard struct {
	// File handle on the directory, which we've flocked.
	f *os.File
	// The absolute path to our pid file.
	path string
	// Was this a shared lock for a read-only database?
	readOnly bool
}

// acquireDirectoryLock gets a lock on the directory (using flock). If
// this is not read-only, it will also write our pid to
// dirPath/pidFileName for convenience.
func acquireDirectoryLock(dirPath string, pidFileName string, readOnly bool) (*directoryLockGuard, error) {
	// Convert to absolute path so that Release still works even if we do an unbalanced
	// chdir in the meantime.
	absPidFilePath, err := filepath.Abs(filepath.Join(dirPath, pidFileName))
	if err != nil {
		return nil, errors.Wrap(err, "cannot get absolute path for pid lock file")
	}
	f, err := os.Open(dirPath)
	if err != nil {
		return nil, errors.Wrapf(err, "cannot open directory %q", dirPath)
	}
	opts := unix.LOCK_EX | unix.LOCK_NB
	if readOnly {
		opts = unix.LOCK_SH | unix.LOCK_NB
	}

	err = unix.Flock(int(f.Fd()), opts)
	if err != nil {
		f.Close()
		return nil, errors.Wrapf(err,
			"Cannot acquire directory lock on %q. Another process is using this Badger database.",
			dirPath)
	}

	if !readOnly {
		// Yes, we happily overwrite a pre-existing pid file. We're the
		// only read-write badger process using this directory.
		err = ioutil.WriteFile(absPidFilePath, []byte(fmt.Sprintf("%d\n", os.Getpid())), 0666)
		if err != nil {
			f.Close()
			return nil, errors.Wrapf(err,
				"Cannot write pid file %q", absPidFilePath)
		}
	}
	return &directoryLockGuard{f, absPidFilePath, readOnly}, nil
}

// Release deletes the pid file and releases our lock on the directory.
func (guard *directoryLockGuard) release() error {
	var err error
	if !guard.readOnly {
		// It's important that we remove the pid file first.
		err = os.Remove(guard.path)
	}

	if closeErr := guard.f.Close(); err == nil {
		err = closeErr
	}
	guard.path = ""
	guard.f = nil

	return err
}

// openDir opens a directory for syncing.
func openDir(path string) (*os.File, error) { return os.Open(path) }

@ -0,0 +1,94 @@

// +build windows

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

// OpenDir opens a directory in windows with write access for syncing.
import (
	"fmt"
	"os"
	"path/filepath"
	"syscall"

	"github.com/pkg/errors"
)

func openDir(path string) (*os.File, error) {
	fd, err := openDirWin(path)
	if err != nil {
		return nil, err
	}
	return os.NewFile(uintptr(fd), path), nil
}

func openDirWin(path string) (fd syscall.Handle, err error) {
	if len(path) == 0 {
		return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND
	}
	pathp, err := syscall.UTF16PtrFromString(path)
	if err != nil {
		return syscall.InvalidHandle, err
	}
	access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE)
	sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE)
	createmode := uint32(syscall.OPEN_EXISTING)
	fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS)
	return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0)
}

// DirectoryLockGuard holds a lock on the directory.
type directoryLockGuard struct {
	path string
}

// AcquireDirectoryLock acquires exclusive access to a directory.
func acquireDirectoryLock(dirPath string, pidFileName string, readOnly bool) (*directoryLockGuard, error) {
	if readOnly {
		return nil, ErrWindowsNotSupported
	}

	// Convert to absolute path so that Release still works even if we do an unbalanced
	// chdir in the meantime.
	absLockFilePath, err := filepath.Abs(filepath.Join(dirPath, pidFileName))
	if err != nil {
		return nil, errors.Wrap(err, "Cannot get absolute path for pid lock file")
	}

	f, err := os.OpenFile(absLockFilePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
	if err != nil {
		return nil, errors.Wrapf(err,
			"Cannot create pid lock file %q. Another process is using this Badger database",
			absLockFilePath)
	}
	_, err = fmt.Fprintf(f, "%d\n", os.Getpid())
	closeErr := f.Close()
	if err != nil {
		return nil, errors.Wrap(err, "Cannot write to pid lock file")
	}
	if closeErr != nil {
		return nil, errors.Wrap(closeErr, "Cannot close pid lock file")
	}
	return &directoryLockGuard{path: absLockFilePath}, nil
}

// Release removes the directory lock.
func (g *directoryLockGuard) release() error {
	path := g.path
	g.path = ""
	return os.Remove(path)
}

@ -0,0 +1,28 @@

/*
Package badger implements an embeddable, simple and fast key-value database,
written in pure Go. It is designed to be highly performant for both reads and
writes simultaneously. Badger uses Multi-Version Concurrency Control (MVCC), and
supports transactions. It runs transactions concurrently, with serializable
snapshot isolation guarantees.

Badger uses an LSM tree along with a value log to separate keys from values,
hence reducing both write amplification and the size of the LSM tree. This
allows the LSM tree to be served entirely from RAM, while the values are served
from SSD.


Usage

Badger has the following main types: DB, Txn, Item and Iterator. DB contains
keys that are associated with values. It must be opened with the appropriate
options before it can be accessed.

All operations happen inside a Txn. Txn represents a transaction, which can
be read-only or read-write. Read-only transactions can read values for a
given key (which are returned inside an Item), or iterate over a set of
key-value pairs using an Iterator (which are returned as Item type values as
well). Read-write transactions can also update and delete keys from the DB.

See the examples for more usage details.
*/
package badger

@ -0,0 +1,108 @@

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"encoding/hex"

	"github.com/pkg/errors"
)

var (
	// ErrValueLogSize is returned when opt.ValueLogFileSize option is not within the valid
	// range.
	ErrValueLogSize = errors.New("Invalid ValueLogFileSize, must be between 1MB and 2GB")

	// ErrKeyNotFound is returned when key isn't found on a txn.Get.
	ErrKeyNotFound = errors.New("Key not found")

	// ErrTxnTooBig is returned if too many writes are fit into a single transaction.
	ErrTxnTooBig = errors.New("Txn is too big to fit into one request")

	// ErrConflict is returned when a transaction conflicts with another transaction. This can happen if
	// the read rows had been updated concurrently by another transaction.
	ErrConflict = errors.New("Transaction Conflict. Please retry")

	// ErrReadOnlyTxn is returned if an update function is called on a read-only transaction.
	ErrReadOnlyTxn = errors.New("No sets or deletes are allowed in a read-only transaction")

	// ErrDiscardedTxn is returned if a previously discarded transaction is re-used.
	ErrDiscardedTxn = errors.New("This transaction has been discarded. Create a new one")

	// ErrEmptyKey is returned if an empty key is passed on an update function.
	ErrEmptyKey = errors.New("Key cannot be empty")

	// ErrRetry is returned when a log file containing the value is not found.
	// This usually indicates that it may have been garbage collected, and the
	// operation needs to be retried.
	ErrRetry = errors.New("Unable to find log file. Please retry")

	// ErrThresholdZero is returned if threshold is set to zero, and value log GC is called.
	// In such a case, GC can't be run.
	ErrThresholdZero = errors.New(
		"Value log GC can't run because threshold is set to zero")

	// ErrNoRewrite is returned if a call for value log GC doesn't result in a log file rewrite.
	ErrNoRewrite = errors.New(
		"Value log GC attempt didn't result in any cleanup")

	// ErrRejected is returned if a value log GC is called either while another GC is running, or
	// after DB::Close has been called.
	ErrRejected = errors.New("Value log GC request rejected")

	// ErrInvalidRequest is returned if the user request is invalid.
	ErrInvalidRequest = errors.New("Invalid request")

	// ErrManagedTxn is returned if the user tries to use an API which isn't
	// allowed due to external management of transactions, when using ManagedDB.
	ErrManagedTxn = errors.New(
		"Invalid API request. Not allowed to perform this action using ManagedDB")

	// ErrInvalidDump if a data dump made previously cannot be loaded into the database.
	ErrInvalidDump = errors.New("Data dump cannot be read")

	// ErrZeroBandwidth is returned if the user passes in zero bandwidth for sequence.
	ErrZeroBandwidth = errors.New("Bandwidth must be greater than zero")

	// ErrInvalidLoadingMode is returned when opt.ValueLogLoadingMode option is not
	// within the valid range.
	ErrInvalidLoadingMode = errors.New("Invalid ValueLogLoadingMode, must be FileIO or MemoryMap")

	// ErrReplayNeeded is returned when opt.ReadOnly is set but the
	// database requires a value log replay.
	ErrReplayNeeded = errors.New("Database was not properly closed, cannot open read-only")

	// ErrWindowsNotSupported is returned when opt.ReadOnly is used on Windows.
	ErrWindowsNotSupported = errors.New("Read-only mode is not supported on Windows")

	// ErrTruncateNeeded is returned when the value log gets corrupt, and requires truncation of
	// corrupt data to allow Badger to run properly.
	ErrTruncateNeeded = errors.New("Data corruption detected. Value log truncate required to run DB. This would result in data loss.")
)

// Key length can't be more than uint16, as determined by table::header.
const maxKeySize = 1<<16 - 8 // 8 bytes are for storing timestamp

func exceedsMaxKeySizeError(key []byte) error {
	return errors.Errorf("Key with size %d exceeded %d limit. Key:\n%s",
		len(key), maxKeySize, hex.Dump(key[:1<<10]))
}

func exceedsMaxValueSizeError(value []byte, maxValueSize int64) error {
	return errors.Errorf("Value with size %d exceeded ValueLogFileSize (%d). Key:\n%s",
		len(value), maxValueSize, hex.Dump(value[:1<<10]))
}

@ -0,0 +1,549 @@
|
|||
/*
|
||||
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package badger
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/dgraph-io/badger/options"
|
||||
|
||||
"github.com/dgraph-io/badger/y"
|
||||
farm "github.com/dgryski/go-farm"
|
||||
)
|
||||
|
||||
type prefetchStatus uint8
|
||||
|
||||
const (
|
||||
prefetched prefetchStatus = iota + 1
|
||||
)
|
||||
|
||||
// Item is returned during iteration. Both the Key() and Value() output is only valid until
|
||||
// iterator.Next() is called.
|
||||
type Item struct {
|
||||
status prefetchStatus
|
||||
err error
|
||||
wg sync.WaitGroup
|
||||
db *DB
|
||||
key []byte
|
||||
vptr []byte
|
||||
meta byte // We need to store meta to know about bitValuePointer.
|
||||
userMeta byte
|
||||
expiresAt uint64
|
||||
val []byte
|
||||
slice *y.Slice // Used only during prefetching.
|
||||
next *Item
|
||||
version uint64
|
||||
txn *Txn
|
||||
}
|
||||
|
||||
// String returns a string representation of Item
|
||||
func (item *Item) String() string {
|
||||
return fmt.Sprintf("key=%q, version=%d, meta=%x", item.Key(), item.Version(), item.meta)
|
||||
}
|
||||
|
||||
// Deprecated
|
||||
// ToString returns a string representation of Item
|
||||
func (item *Item) ToString() string {
|
||||
return item.String()
|
||||
}
|
||||
|
||||
// Key returns the key.
|
||||
//
|
||||
// Key is only valid as long as item is valid, or transaction is valid. If you need to use it
|
||||
// outside its validity, please use KeyCopy
|
||||
func (item *Item) Key() []byte {
|
||||
return item.key
|
||||
}
|
||||
|
||||
// KeyCopy returns a copy of the key of the item, writing it to dst slice.
|
||||
// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
|
||||
// returned.
|
||||
func (item *Item) KeyCopy(dst []byte) []byte {
|
||||
return y.SafeCopy(dst, item.key)
|
||||
}
|
||||
|
||||
// Version returns the commit timestamp of the item.
|
||||
func (item *Item) Version() uint64 {
|
||||
return item.version
|
||||
}
|
||||
|
||||
// Value retrieves the value of the item from the value log.
|
||||
//
|
||||
// This method must be called within a transaction. Calling it outside a
|
||||
// transaction is considered undefined behavior. If an iterator is being used,
|
||||
// then Item.Value() is defined in the current iteration only, because items are
|
||||
// reused.
|
||||
//
|
||||
// If you need to use a value outside a transaction, please use Item.ValueCopy
|
||||
// instead, or copy it yourself. Value might change once discard or commit is called.
|
||||
// Use ValueCopy if you want to do a Set after Get.
|
||||
func (item *Item) Value() ([]byte, error) {
|
||||
item.wg.Wait()
|
||||
if item.status == prefetched {
|
||||
return item.val, item.err
|
||||
}
|
||||
buf, cb, err := item.yieldItemValue()
|
||||
if cb != nil {
|
||||
item.txn.callbacks = append(item.txn.callbacks, cb)
|
||||
}
|
||||
return buf, err
|
||||
}
|
||||
|
||||
// ValueCopy returns a copy of the value of the item from the value log, writing it to dst slice.
|
||||
// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
|
||||
// returned. Tip: It might make sense to reuse the returned slice as dst argument for the next call.
|
||||
//
|
||||
// This function is useful in long running iterate/update transactions to avoid a write deadlock.
|
||||
// See Github issue: https://github.com/dgraph-io/badger/issues/315
|
||||
func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
|
||||
item.wg.Wait()
|
||||
if item.status == prefetched {
|
||||
return y.SafeCopy(dst, item.val), item.err
|
||||
}
|
||||
buf, cb, err := item.yieldItemValue()
|
||||
defer runCallback(cb)
|
||||
return y.SafeCopy(dst, buf), err
|
||||
}
|
||||
|
||||
func (item *Item) hasValue() bool {
|
||||
if item.meta == 0 && item.vptr == nil {
|
||||
// key not found
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// IsDeletedOrExpired returns true if item contains deleted or expired value.
|
||||
func (item *Item) IsDeletedOrExpired() bool {
|
||||
return isDeletedOrExpired(item.meta, item.expiresAt)
|
||||
}
|
||||
|
||||
func (item *Item) yieldItemValue() ([]byte, func(), error) {
|
||||
if !item.hasValue() {
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
if item.slice == nil {
|
||||
item.slice = new(y.Slice)
|
||||
}
|
||||
|
||||
if (item.meta & bitValuePointer) == 0 {
|
||||
val := item.slice.Resize(len(item.vptr))
|
||||
copy(val, item.vptr)
|
||||
return val, nil, nil
|
||||
}
|
||||
|
||||
var vp valuePointer
|
||||
vp.Decode(item.vptr)
|
||||
return item.db.vlog.Read(vp, item.slice)
|
||||
}
|
||||
|
||||
func runCallback(cb func()) {
|
||||
if cb != nil {
|
||||
cb()
|
||||
}
|
||||
}
|
||||
|
||||
func (item *Item) prefetchValue() {
|
||||
val, cb, err := item.yieldItemValue()
|
||||
defer runCallback(cb)
|
||||
|
||||
item.err = err
|
||||
item.status = prefetched
|
||||
if val == nil {
|
||||
return
|
||||
}
|
||||
if item.db.opt.ValueLogLoadingMode == options.MemoryMap {
|
||||
buf := item.slice.Resize(len(val))
|
||||
copy(buf, val)
|
||||
item.val = buf
|
||||
} else {
|
||||
item.val = val
|
||||
}
|
||||
}
|
||||
|
||||
// EstimatedSize returns approximate size of the key-value pair.
|
||||
//
|
||||
// This can be called while iterating through a store to quickly estimate the
|
||||
// size of a range of key-value pairs (without fetching the corresponding
|
||||
// values).
|
||||
func (item *Item) EstimatedSize() int64 {
|
||||
if !item.hasValue() {
|
||||
return 0
|
||||
}
|
||||
if (item.meta & bitValuePointer) == 0 {
|
||||
return int64(len(item.key) + len(item.vptr))
|
||||
}
|
||||
var vp valuePointer
|
||||
vp.Decode(item.vptr)
|
||||
return int64(vp.Len) // includes key length.
|
||||
}

// UserMeta returns the userMeta set by the user. Typically, this byte,
// optionally set by the user, is used to interpret the value.
func (item *Item) UserMeta() byte {
	return item.userMeta
}

// ExpiresAt returns a Unix time value indicating when the item will be
// considered expired. 0 indicates that the item will never expire.
func (item *Item) ExpiresAt() uint64 {
	return item.expiresAt
}

// TODO: Switch this to use linked list container in Go.
type list struct {
	head *Item
	tail *Item
}

func (l *list) push(i *Item) {
	i.next = nil
	if l.tail == nil {
		l.head = i
		l.tail = i
		return
	}
	l.tail.next = i
	l.tail = i
}

func (l *list) pop() *Item {
	if l.head == nil {
		return nil
	}
	i := l.head
	if l.head == l.tail {
		l.tail = nil
		l.head = nil
	} else {
		l.head = i.next
	}
	i.next = nil
	return i
}

// IteratorOptions is used to set options when iterating over Badger key-value
// stores.
//
// This package provides DefaultIteratorOptions which contains options that
// should work for most applications. Consider using that as a starting point
// before customizing it for your own needs.
type IteratorOptions struct {
	// Indicates whether we should prefetch values during iteration and store them.
	PrefetchValues bool
	// How many KV pairs to prefetch while iterating. Valid only if PrefetchValues is true.
	PrefetchSize int
	Reverse      bool // Direction of iteration. False is forward, true is backward.
	AllVersions  bool // Fetch all valid versions of the same key.
}

// DefaultIteratorOptions contains default options when iterating over Badger key-value stores.
var DefaultIteratorOptions = IteratorOptions{
	PrefetchValues: true,
	PrefetchSize:   100,
	Reverse:        false,
	AllVersions:    false,
}
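
// A rough usage sketch for the types above, assuming an open *DB and that
// db.View and Item.Value keep their usual signatures in this version:
//
//	err := db.View(func(txn *Txn) error {
//		opts := DefaultIteratorOptions
//		opts.PrefetchSize = 10
//		it := txn.NewIterator(opts)
//		defer it.Close()
//		for it.Rewind(); it.Valid(); it.Next() {
//			item := it.Item()
//			val, err := item.Value()
//			if err != nil {
//				return err
//			}
//			fmt.Printf("key=%s val=%s\n", item.Key(), val)
//		}
//		return nil
//	})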

// Iterator helps iterating over the KV pairs in a lexicographically sorted order.
type Iterator struct {
	iitr   *y.MergeIterator
	txn    *Txn
	readTs uint64

	opt   IteratorOptions
	item  *Item
	data  list
	waste list

	lastKey []byte // Used to skip over multiple versions of the same key.
}

// NewIterator returns a new iterator. Depending upon the options, either only keys, or both
// key-value pairs would be fetched. The keys are returned in lexicographically sorted order.
// Using prefetch is highly recommended if you're doing a long-running iteration.
// Avoid long-running iterations in update transactions.
func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
	tables, decr := txn.db.getMemTables()
	defer decr()
	txn.db.vlog.incrIteratorCount()
	var iters []y.Iterator
	if itr := txn.newPendingWritesIterator(opt.Reverse); itr != nil {
		iters = append(iters, itr)
	}
	for i := 0; i < len(tables); i++ {
		iters = append(iters, tables[i].NewUniIterator(opt.Reverse))
	}
	iters = txn.db.lc.appendIterators(iters, opt.Reverse) // This will increment references.
	res := &Iterator{
		txn:    txn,
		iitr:   y.NewMergeIterator(iters, opt.Reverse),
		opt:    opt,
		readTs: txn.readTs,
	}
	return res
}

func (it *Iterator) newItem() *Item {
	item := it.waste.pop()
	if item == nil {
		item = &Item{slice: new(y.Slice), db: it.txn.db, txn: it.txn}
	}
	return item
}

// Item returns pointer to the current key-value pair.
// This item is only valid until it.Next() gets called.
func (it *Iterator) Item() *Item {
	tx := it.txn
	if tx.update {
		// Track reads if this is an update txn.
		tx.reads = append(tx.reads, farm.Fingerprint64(it.item.Key()))
	}
	return it.item
}

// Valid returns false when iteration is done.
func (it *Iterator) Valid() bool { return it.item != nil }

// ValidForPrefix returns false when iteration is done
// or when the current key is not prefixed by the specified prefix.
func (it *Iterator) ValidForPrefix(prefix []byte) bool {
	return it.item != nil && bytes.HasPrefix(it.item.key, prefix)
}
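
// A short sketch of prefix scanning with Seek plus ValidForPrefix, assuming a
// transaction txn as above:
//
//	prefix := []byte("user/")
//	it := txn.NewIterator(DefaultIteratorOptions)
//	defer it.Close()
//	for it.Seek(prefix); it.ValidForPrefix(prefix); it.Next() {
//		item := it.Item()
//		// process item ...
//	}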

// Close would close the iterator. It is important to call this when you're done with iteration.
func (it *Iterator) Close() {
	it.iitr.Close()

	// It is important to wait for the fill goroutines to finish. Otherwise, we might leave zombie
	// goroutines behind, which are waiting to acquire file read locks after DB has been closed.
	waitFor := func(l list) {
		item := l.pop()
		for item != nil {
			item.wg.Wait()
			item = l.pop()
		}
	}
	waitFor(it.waste)
	waitFor(it.data)

	// TODO: We could handle this error.
	_ = it.txn.db.vlog.decrIteratorCount()
}

// Next would advance the iterator by one. Always check it.Valid() after a Next()
// to ensure you have access to a valid it.Item().
func (it *Iterator) Next() {
	// Reuse current item
	it.item.wg.Wait() // Just cleaner to wait before pushing to avoid doing ref counting.
	it.waste.push(it.item)

	// Set next item to current
	it.item = it.data.pop()

	for it.iitr.Valid() {
		if it.parseItem() {
			// parseItem calls one extra next.
			// This is used to deal with the complexity of reverse iteration.
			break
		}
	}
}

func isDeletedOrExpired(meta byte, expiresAt uint64) bool {
	if meta&bitDelete > 0 {
		return true
	}
	if expiresAt == 0 {
		return false
	}
	return expiresAt <= uint64(time.Now().Unix())
}

// parseItem is a complex function because it needs to handle both forward and reverse iteration
// implementation. We store keys such that their versions are sorted in descending order. This makes
// forward iteration efficient, but reverse iteration complicated. This tradeoff is better because
// forward iteration is more common than reverse.
//
// This function advances the iterator.
func (it *Iterator) parseItem() bool {
	mi := it.iitr
	key := mi.Key()

	setItem := func(item *Item) {
		if it.item == nil {
			it.item = item
		} else {
			it.data.push(item)
		}
	}

	// Skip badger keys.
	if bytes.HasPrefix(key, badgerPrefix) {
		mi.Next()
		return false
	}

	// Skip any versions which are beyond the readTs.
	version := y.ParseTs(key)
	if version > it.readTs {
		mi.Next()
		return false
	}

	if it.opt.AllVersions {
		// Return deleted or expired values also, otherwise user can't figure out
		// whether the key was deleted.
		item := it.newItem()
		it.fill(item)
		setItem(item)
		mi.Next()
		return true
	}

	// If iterating in forward direction, then just checking the last key against current key would
	// be sufficient.
	if !it.opt.Reverse {
		if y.SameKey(it.lastKey, key) {
			mi.Next()
			return false
		}
		// Only track in forward direction.
		// We should update lastKey as soon as we find a different key in our snapshot.
		// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
		// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
		// which is wrong. Therefore, update lastKey here.
		it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
	}

FILL:
	// If deleted, advance and return.
	vs := mi.Value()
	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
		mi.Next()
		return false
	}

	item := it.newItem()
	it.fill(item)
	// Fill item based on the current cursor position. Every earlier branch that advanced the
	// cursor also returned, so reaching here means mi.Next() has not been called for this key yet.

	mi.Next() // Advance but no fill item yet.
	if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid.
		setItem(item)
		return true
	}

	// Reverse direction.
	nextTs := y.ParseTs(mi.Key())
	mik := y.ParseKey(mi.Key())
	if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
		// This is a valid potential candidate.
		goto FILL
	}
	// Ignore the next candidate. Return the current one.
	setItem(item)
	return true
}

func (it *Iterator) fill(item *Item) {
	vs := it.iitr.Value()
	item.meta = vs.Meta
	item.userMeta = vs.UserMeta
	item.expiresAt = vs.ExpiresAt

	item.version = y.ParseTs(it.iitr.Key())
	item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))

	item.vptr = y.SafeCopy(item.vptr, vs.Value)
	item.val = nil
	if it.opt.PrefetchValues {
		item.wg.Add(1)
		go func() {
			// FIXME we are not handling errors here.
			item.prefetchValue()
			item.wg.Done()
		}()
	}
}

func (it *Iterator) prefetch() {
	prefetchSize := 2
	if it.opt.PrefetchValues && it.opt.PrefetchSize > 1 {
		prefetchSize = it.opt.PrefetchSize
	}

	i := it.iitr
	var count int
	it.item = nil
	for i.Valid() {
		if !it.parseItem() {
			continue
		}
		count++
		if count == prefetchSize {
			break
		}
	}
}

// Seek would seek to the provided key if present. If absent, it would seek to the next smallest key
// greater than the provided key if iterating in the forward direction. Behavior would be reversed if
// iterating backwards.
func (it *Iterator) Seek(key []byte) {
	for i := it.data.pop(); i != nil; i = it.data.pop() {
		i.wg.Wait()
		it.waste.push(i)
	}

	it.lastKey = it.lastKey[:0]
	if len(key) == 0 {
		it.iitr.Rewind()
		it.prefetch()
		return
	}

	if !it.opt.Reverse {
		key = y.KeyWithTs(key, it.txn.readTs)
	} else {
		key = y.KeyWithTs(key, 0)
	}
	it.iitr.Seek(key)
	it.prefetch()
}

// Rewind would rewind the iterator cursor all the way to zero-th position, which would be the
// smallest key if iterating forward, and largest if iterating backward. It does not keep track of
// whether the cursor started with a Seek().
func (it *Iterator) Rewind() {
	i := it.data.pop()
	for i != nil {
		i.wg.Wait() // Just cleaner to wait before pushing. No ref counting needed.
		it.waste.push(i)
		i = it.data.pop()
	}

	it.lastKey = it.lastKey[:0]
	it.iitr.Rewind()
	it.prefetch()
}
@ -0,0 +1,294 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"fmt"
	"sort"
	"sync"

	"github.com/dgraph-io/badger/table"
	"github.com/dgraph-io/badger/y"
	"github.com/pkg/errors"
)

type levelHandler struct {
	// Guards tables, totalSize.
	sync.RWMutex

	// For level >= 1, tables are sorted by key ranges, which do not overlap.
	// For level 0, tables are sorted by time.
	// For level 0, newest tables are at the back. Compact the oldest one first, which is at the front.
	tables    []*table.Table
	totalSize int64

	// The following are initialized once and const.
	level        int
	strLevel     string
	maxTotalSize int64
	db           *DB
}

func (s *levelHandler) getTotalSize() int64 {
	s.RLock()
	defer s.RUnlock()
	return s.totalSize
}

// initTables replaces s.tables with given tables. This is done during loading.
func (s *levelHandler) initTables(tables []*table.Table) {
	s.Lock()
	defer s.Unlock()

	s.tables = tables
	s.totalSize = 0
	for _, t := range tables {
		s.totalSize += t.Size()
	}

	if s.level == 0 {
		// Key range will overlap. Just sort by fileID in ascending order
		// because newer tables are at the end of level 0.
		sort.Slice(s.tables, func(i, j int) bool {
			return s.tables[i].ID() < s.tables[j].ID()
		})
	} else {
		// Sort tables by keys.
		sort.Slice(s.tables, func(i, j int) bool {
			return y.CompareKeys(s.tables[i].Smallest(), s.tables[j].Smallest()) < 0
		})
	}
}

// deleteTables removes the given tables from this level.
func (s *levelHandler) deleteTables(toDel []*table.Table) error {
	s.Lock() // s.Unlock() below

	toDelMap := make(map[uint64]struct{})
	for _, t := range toDel {
		toDelMap[t.ID()] = struct{}{}
	}

	// Make a copy as iterators might be keeping a slice of tables.
	var newTables []*table.Table
	for _, t := range s.tables {
		_, found := toDelMap[t.ID()]
		if !found {
			newTables = append(newTables, t)
			continue
		}
		s.totalSize -= t.Size()
	}
	s.tables = newTables

	s.Unlock() // Unlock s _before_ we DecrRef our tables, which can be slow.

	return decrRefs(toDel)
}

// replaceTables will replace tables[left:right] with newTables. Note this EXCLUDES tables[right].
// You must call decr() to delete the old tables _after_ writing the update to the manifest.
func (s *levelHandler) replaceTables(newTables []*table.Table) error {
	// Need to re-search the range of tables in this level to be replaced as other goroutines might
	// be changing it as well. (They can't touch our tables, but if they add/remove other tables,
	// the indices get shifted around.)
	if len(newTables) == 0 {
		return nil
	}

	s.Lock() // We s.Unlock() below.

	// Increase totalSize first.
	for _, tbl := range newTables {
		s.totalSize += tbl.Size()
		tbl.IncrRef()
	}

	kr := keyRange{
		left:  newTables[0].Smallest(),
		right: newTables[len(newTables)-1].Biggest(),
	}
	left, right := s.overlappingTables(levelHandlerRLocked{}, kr)

	toDecr := make([]*table.Table, right-left)
	// Update totalSize and reference counts.
	for i := left; i < right; i++ {
		tbl := s.tables[i]
		s.totalSize -= tbl.Size()
		toDecr[i-left] = tbl
	}

	// To be safe, just make a copy. TODO: Be more careful and avoid copying.
	numDeleted := right - left
	numAdded := len(newTables)
	tables := make([]*table.Table, len(s.tables)-numDeleted+numAdded)
	y.AssertTrue(left == copy(tables, s.tables[:left]))
	t := tables[left:]
	y.AssertTrue(numAdded == copy(t, newTables))
	t = t[numAdded:]
	y.AssertTrue(len(s.tables[right:]) == copy(t, s.tables[right:]))
	s.tables = tables
	s.Unlock() // s.Unlock before we DecrRef tables -- that can be slow.
	return decrRefs(toDecr)
}

func decrRefs(tables []*table.Table) error {
	for _, table := range tables {
		if err := table.DecrRef(); err != nil {
			return err
		}
	}
	return nil
}

func newLevelHandler(db *DB, level int) *levelHandler {
	return &levelHandler{
		level:    level,
		strLevel: fmt.Sprintf("l%d", level),
		db:       db,
	}
}

// tryAddLevel0Table returns true if ok and no stalling.
func (s *levelHandler) tryAddLevel0Table(t *table.Table) bool {
	y.AssertTrue(s.level == 0)
	// Need lock as we may be deleting the first table during a level 0 compaction.
	s.Lock()
	defer s.Unlock()
	if len(s.tables) >= s.db.opt.NumLevelZeroTablesStall {
		return false
	}

	s.tables = append(s.tables, t)
	t.IncrRef()
	s.totalSize += t.Size()

	return true
}

func (s *levelHandler) numTables() int {
	s.RLock()
	defer s.RUnlock()
	return len(s.tables)
}

func (s *levelHandler) close() error {
	s.RLock()
	defer s.RUnlock()
	var err error
	for _, t := range s.tables {
		if closeErr := t.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}
	return errors.Wrap(err, "levelHandler.close")
}

// getTableForKey acquires a read-lock to access s.tables. It returns a list of tableHandlers.
func (s *levelHandler) getTableForKey(key []byte) ([]*table.Table, func() error) {
	s.RLock()
	defer s.RUnlock()

	if s.level == 0 {
		// For level 0, we need to check every table. Remember to make a copy as s.tables may change
		// once we exit this function, and we don't want to lock s.tables while seeking in tables.
		// CAUTION: Reverse the tables.
		out := make([]*table.Table, 0, len(s.tables))
		for i := len(s.tables) - 1; i >= 0; i-- {
			out = append(out, s.tables[i])
			s.tables[i].IncrRef()
		}
		return out, func() error {
			for _, t := range out {
				if err := t.DecrRef(); err != nil {
					return err
				}
			}
			return nil
		}
	}
	// For level >= 1, we can do a binary search as key range does not overlap.
	idx := sort.Search(len(s.tables), func(i int) bool {
		return y.CompareKeys(s.tables[i].Biggest(), key) >= 0
	})
	if idx >= len(s.tables) {
		// Given key is strictly greater than every element we have.
		return nil, func() error { return nil }
	}
	tbl := s.tables[idx]
	tbl.IncrRef()
	return []*table.Table{tbl}, tbl.DecrRef
}

// get returns value for a given key or the key after that. If not found, return nil.
func (s *levelHandler) get(key []byte) (y.ValueStruct, error) {
	tables, decr := s.getTableForKey(key)
	keyNoTs := y.ParseKey(key)

	var maxVs y.ValueStruct
	for _, th := range tables {
		if th.DoesNotHave(keyNoTs) {
			y.NumLSMBloomHits.Add(s.strLevel, 1)
			continue
		}

		it := th.NewIterator(false)
		defer it.Close()

		y.NumLSMGets.Add(s.strLevel, 1)
		it.Seek(key)
		if !it.Valid() {
			continue
		}
		if y.SameKey(key, it.Key()) {
			if version := y.ParseTs(it.Key()); maxVs.Version < version {
				maxVs = it.Value()
				maxVs.Version = version
			}
		}
	}
	return maxVs, decr()
}

// appendIterators appends iterators to an array of iterators, for merging.
// Note: This obtains references for the table handlers. Remember to close these iterators.
func (s *levelHandler) appendIterators(iters []y.Iterator, reversed bool) []y.Iterator {
	s.RLock()
	defer s.RUnlock()

	if s.level == 0 {
		// Remember to add in reverse order!
		// The newer table at the end of s.tables should be added first as it takes precedence.
		return appendIteratorsReversed(iters, s.tables, reversed)
	}
	return append(iters, table.NewConcatIterator(s.tables, reversed))
}

type levelHandlerRLocked struct{}

// overlappingTables returns the tables that intersect with key range. Returns a half-open
// interval [left, right). This function should already have acquired a read lock, and this is so
// important the caller must pass an empty parameter declaring such.
func (s *levelHandler) overlappingTables(_ levelHandlerRLocked, kr keyRange) (int, int) {
	left := sort.Search(len(s.tables), func(i int) bool {
		return y.CompareKeys(kr.left, s.tables[i].Biggest()) <= 0
	})
	right := sort.Search(len(s.tables), func(i int) bool {
		return y.CompareKeys(kr.right, s.tables[i].Smallest()) < 0
	})
	return left, right
}
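
// For example (keys compared with y.CompareKeys): if this level holds tables
// spanning [a, c], [d, f] and [g, i], then kr = [e, h] yields (1, 3), i.e.
// tables[1] and tables[2] overlap the range, and tables[3:] do not.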
@ -0,0 +1,706 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"fmt"
	"math/rand"
	"os"
	"sort"
	"time"

	"golang.org/x/net/trace"

	"github.com/dgraph-io/badger/protos"
	"github.com/dgraph-io/badger/table"
	"github.com/dgraph-io/badger/y"
	"github.com/pkg/errors"
)

type levelsController struct {
	nextFileID uint64 // Atomic
	elog       trace.EventLog

	// The following are initialized once and const.
	levels []*levelHandler
	kv     *DB

	cstatus compactStatus
}

var (
	// This is for getting timings between stalls.
	lastUnstalled time.Time
)

// revertToManifest checks that all necessary table files exist and removes all table files not
// referenced by the manifest. idMap is a set of table file id's that were read from the directory
// listing.
func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error {
	// 1. Check all files in manifest exist.
	for id := range mf.Tables {
		if _, ok := idMap[id]; !ok {
			return fmt.Errorf("file does not exist for table %d", id)
		}
	}

	// 2. Delete files that shouldn't exist.
	for id := range idMap {
		if _, ok := mf.Tables[id]; !ok {
			kv.elog.Printf("Table file %d not referenced in MANIFEST\n", id)
			filename := table.NewFilename(id, kv.opt.Dir)
			if err := os.Remove(filename); err != nil {
				return y.Wrapf(err, "While removing table %d", id)
			}
		}
	}

	return nil
}

func newLevelsController(kv *DB, mf *Manifest) (*levelsController, error) {
	y.AssertTrue(kv.opt.NumLevelZeroTablesStall > kv.opt.NumLevelZeroTables)
	s := &levelsController{
		kv:     kv,
		elog:   kv.elog,
		levels: make([]*levelHandler, kv.opt.MaxLevels),
	}
	s.cstatus.levels = make([]*levelCompactStatus, kv.opt.MaxLevels)

	for i := 0; i < kv.opt.MaxLevels; i++ {
		s.levels[i] = newLevelHandler(kv, i)
		if i == 0 {
			// Do nothing.
		} else if i == 1 {
			// Level 1 probably shouldn't be too much bigger than level 0.
			s.levels[i].maxTotalSize = kv.opt.LevelOneSize
		} else {
			s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(kv.opt.LevelSizeMultiplier)
		}
		s.cstatus.levels[i] = new(levelCompactStatus)
	}

	// Compare manifest against directory, check for existent/non-existent files, and remove.
	if err := revertToManifest(kv, mf, getIDMap(kv.opt.Dir)); err != nil {
		return nil, err
	}

	// Some files may be deleted. Let's reload.
	tables := make([][]*table.Table, kv.opt.MaxLevels)
	var maxFileID uint64
	for fileID, tableManifest := range mf.Tables {
		fname := table.NewFilename(fileID, kv.opt.Dir)
		var flags uint32 = y.Sync
		if kv.opt.ReadOnly {
			flags |= y.ReadOnly
		}
		fd, err := y.OpenExistingFile(fname, flags)
		if err != nil {
			closeAllTables(tables)
			return nil, errors.Wrapf(err, "Opening file: %q", fname)
		}

		t, err := table.OpenTable(fd, kv.opt.TableLoadingMode)
		if err != nil {
			closeAllTables(tables)
			return nil, errors.Wrapf(err, "Opening table: %q", fname)
		}

		level := tableManifest.Level
		tables[level] = append(tables[level], t)

		if fileID > maxFileID {
			maxFileID = fileID
		}
	}
	s.nextFileID = maxFileID + 1
	for i, tbls := range tables {
		s.levels[i].initTables(tbls)
	}

	// Make sure key ranges do not overlap etc.
	if err := s.validate(); err != nil {
		_ = s.cleanupLevels()
		return nil, errors.Wrap(err, "Level validation")
	}

	// Sync directory (because we have at least removed some files, or previously created the
	// manifest file).
	if err := syncDir(kv.opt.Dir); err != nil {
		_ = s.close()
		return nil, err
	}

	return s, nil
}

// Closes the tables, for cleanup in newLevelsController. (We Close() instead of using DecrRef()
// because that would delete the underlying files.) We ignore errors, which is OK because tables
// are read-only.
func closeAllTables(tables [][]*table.Table) {
	for _, tableSlice := range tables {
		for _, table := range tableSlice {
			_ = table.Close()
		}
	}
}

func (s *levelsController) cleanupLevels() error {
	var firstErr error
	for _, l := range s.levels {
		if err := l.close(); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}

func (s *levelsController) startCompact(lc *y.Closer) {
	n := s.kv.opt.NumCompactors
	lc.AddRunning(n - 1)
	for i := 0; i < n; i++ {
		go s.runWorker(lc)
	}
}

func (s *levelsController) runWorker(lc *y.Closer) {
	defer lc.Done()
	if s.kv.opt.DoNotCompact {
		return
	}

	time.Sleep(time.Duration(rand.Int31n(1000)) * time.Millisecond)
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		// Can add a done channel or other stuff.
		case <-ticker.C:
			prios := s.pickCompactLevels()
			for _, p := range prios {
				// TODO: Handle error.
				didCompact, _ := s.doCompact(p)
				if didCompact {
					break
				}
			}
		case <-lc.HasBeenClosed():
			return
		}
	}
}

// Returns true if level zero may be compacted, without accounting for compactions that already
// might be happening.
func (s *levelsController) isLevel0Compactable() bool {
	return s.levels[0].numTables() >= s.kv.opt.NumLevelZeroTables
}

// Returns true if the non-zero level may be compacted. delSize provides the size of the tables
// which are currently being compacted so that we treat them as already having started being
// compacted (because they have been, yet their size is already counted in getTotalSize).
func (l *levelHandler) isCompactable(delSize int64) bool {
	return l.getTotalSize()-delSize >= l.maxTotalSize
}

type compactionPriority struct {
	level int
	score float64
}

// pickCompactLevels determines which levels to compact.
// Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
	// This function must use identical criteria for guaranteeing compaction's progress that
	// addLevel0Table uses.

	// cstatus is checked to see if level 0's tables are already being compacted
	if !s.cstatus.overlapsWith(0, infRange) && s.isLevel0Compactable() {
		pri := compactionPriority{
			level: 0,
			score: float64(s.levels[0].numTables()) / float64(s.kv.opt.NumLevelZeroTables),
		}
		prios = append(prios, pri)
	}

	for i, l := range s.levels[1:] {
		// Don't consider those tables that are already being compacted right now.
		delSize := s.cstatus.delSize(i + 1)

		if l.isCompactable(delSize) {
			pri := compactionPriority{
				level: i + 1,
				score: float64(l.getTotalSize()-delSize) / float64(l.maxTotalSize),
			}
			prios = append(prios, pri)
		}
	}
	sort.Slice(prios, func(i, j int) bool {
		return prios[i].score > prios[j].score
	})
	return prios
}
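
// For example, with NumLevelZeroTables = 5 and seven level-0 tables, level 0
// scores 7/5 = 1.4; a level holding 15 MB against a 10 MB maxTotalSize scores
// 1.5 and, after the sort above, would be tried first.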

// compactBuildTables merges topTables and botTables to form a list of new tables.
func (s *levelsController) compactBuildTables(
	l int, cd compactDef) ([]*table.Table, func() error, error) {
	topTables := cd.top
	botTables := cd.bot

	// Create iterators across all the tables involved first.
	var iters []y.Iterator
	if l == 0 {
		iters = appendIteratorsReversed(iters, topTables, false)
	} else {
		y.AssertTrue(len(topTables) == 1)
		iters = []y.Iterator{topTables[0].NewIterator(false)}
	}

	// Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap.
	iters = append(iters, table.NewConcatIterator(botTables, false))
	it := y.NewMergeIterator(iters, false)
	defer it.Close() // Important to close the iterator to do ref counting.

	it.Rewind()

	// Start generating new tables.
	type newTableResult struct {
		table *table.Table
		err   error
	}
	resultCh := make(chan newTableResult)
	var i int
	for ; it.Valid(); i++ {
		timeStart := time.Now()
		builder := table.NewTableBuilder()
		for ; it.Valid(); it.Next() {
			if builder.ReachedCapacity(s.kv.opt.MaxTableSize) {
				break
			}
			y.Check(builder.Add(it.Key(), it.Value()))
		}
		// it.Valid() held at least once in the loop above, so Add() was called at least once and
		// the builder is not empty.
		y.AssertTrue(!builder.Empty())

		cd.elog.LazyPrintf("LOG Compact. Iteration to generate one table took: %v\n", time.Since(timeStart))

		fileID := s.reserveFileID()
		go func(builder *table.Builder) {
			defer builder.Close()

			fd, err := y.CreateSyncedFile(table.NewFilename(fileID, s.kv.opt.Dir), true)
			if err != nil {
				resultCh <- newTableResult{nil, errors.Wrapf(err, "While opening new table: %d", fileID)}
				return
			}

			if _, err := fd.Write(builder.Finish()); err != nil {
				resultCh <- newTableResult{nil, errors.Wrapf(err, "Unable to write to file: %d", fileID)}
				return
			}

			tbl, err := table.OpenTable(fd, s.kv.opt.TableLoadingMode)
			// decrRef is added below.
			resultCh <- newTableResult{tbl, errors.Wrapf(err, "Unable to open table: %q", fd.Name())}
		}(builder)
	}

	newTables := make([]*table.Table, 0, 20)

	// Wait for all table builders to finish.
	var firstErr error
	for x := 0; x < i; x++ {
		res := <-resultCh
		newTables = append(newTables, res.table)
		if firstErr == nil {
			firstErr = res.err
		}
	}

	if firstErr == nil {
		// Ensure created files' directory entries are visible. We don't mind the extra latency
		// from not doing this ASAP after all file creation has finished because this is a
		// background operation.
		firstErr = syncDir(s.kv.opt.Dir)
	}

	if firstErr != nil {
		// An error happened. Delete all the newly created table files (by calling DecrRef
		// -- we're the only holders of a ref).
		for j := 0; j < i; j++ {
			if newTables[j] != nil {
				newTables[j].DecrRef()
			}
		}
		errorReturn := errors.Wrapf(firstErr, "While running compaction for: %+v", cd)
		return nil, nil, errorReturn
	}

	sort.Slice(newTables, func(i, j int) bool {
		return y.CompareKeys(newTables[i].Biggest(), newTables[j].Biggest()) < 0
	})

	return newTables, func() error { return decrRefs(newTables) }, nil
}

func buildChangeSet(cd *compactDef, newTables []*table.Table) protos.ManifestChangeSet {
	changes := []*protos.ManifestChange{}
	for _, table := range newTables {
		changes = append(changes, makeTableCreateChange(table.ID(), cd.nextLevel.level))
	}
	for _, table := range cd.top {
		changes = append(changes, makeTableDeleteChange(table.ID()))
	}
	for _, table := range cd.bot {
		changes = append(changes, makeTableDeleteChange(table.ID()))
	}
	return protos.ManifestChangeSet{Changes: changes}
}

type compactDef struct {
	elog trace.Trace

	thisLevel *levelHandler
	nextLevel *levelHandler

	top []*table.Table
	bot []*table.Table

	thisRange keyRange
	nextRange keyRange

	thisSize int64
}

func (cd *compactDef) lockLevels() {
	cd.thisLevel.RLock()
	cd.nextLevel.RLock()
}

func (cd *compactDef) unlockLevels() {
	cd.nextLevel.RUnlock()
	cd.thisLevel.RUnlock()
}

func (s *levelsController) fillTablesL0(cd *compactDef) bool {
	cd.lockLevels()
	defer cd.unlockLevels()

	cd.top = make([]*table.Table, len(cd.thisLevel.tables))
	copy(cd.top, cd.thisLevel.tables)
	if len(cd.top) == 0 {
		return false
	}
	cd.thisRange = infRange

	kr := getKeyRange(cd.top)
	left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, kr)
	cd.bot = make([]*table.Table, right-left)
	copy(cd.bot, cd.nextLevel.tables[left:right])

	if len(cd.bot) == 0 {
		cd.nextRange = kr
	} else {
		cd.nextRange = getKeyRange(cd.bot)
	}

	if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
		return false
	}

	return true
}

func (s *levelsController) fillTables(cd *compactDef) bool {
	cd.lockLevels()
	defer cd.unlockLevels()

	tbls := make([]*table.Table, len(cd.thisLevel.tables))
	copy(tbls, cd.thisLevel.tables)
	if len(tbls) == 0 {
		return false
	}

	// Find the biggest table, and compact that first.
	// TODO: Try other table picking strategies.
	sort.Slice(tbls, func(i, j int) bool {
		return tbls[i].Size() > tbls[j].Size()
	})

	for _, t := range tbls {
		cd.thisSize = t.Size()
		cd.thisRange = keyRange{
			left:  t.Smallest(),
			right: t.Biggest(),
		}
		if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) {
			continue
		}
		cd.top = []*table.Table{t}
		left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange)

		cd.bot = make([]*table.Table, right-left)
		copy(cd.bot, cd.nextLevel.tables[left:right])

		if len(cd.bot) == 0 {
			cd.bot = []*table.Table{}
			cd.nextRange = cd.thisRange
			if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
				continue
			}
			return true
		}
		cd.nextRange = getKeyRange(cd.bot)

		if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) {
			continue
		}

		if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
			continue
		}
		return true
	}
	return false
}

func (s *levelsController) runCompactDef(l int, cd compactDef) (err error) {
	timeStart := time.Now()

	thisLevel := cd.thisLevel
	nextLevel := cd.nextLevel

	if thisLevel.level >= 1 && len(cd.bot) == 0 {
		y.AssertTrue(len(cd.top) == 1)
		tbl := cd.top[0]

		// We write to the manifest _before_ we delete files (and after we created files).
		changes := []*protos.ManifestChange{
			// The order matters here -- you can't temporarily have two copies of the same
			// table id when reloading the manifest.
			makeTableDeleteChange(tbl.ID()),
			makeTableCreateChange(tbl.ID(), nextLevel.level),
		}
		if err := s.kv.manifest.addChanges(changes); err != nil {
			return err
		}

		// We have to add to nextLevel before we remove from thisLevel, not after. This way, we
		// don't have a bug where reads would see keys missing from both levels.

		// Note: It's critical that we add tables (replace them) in nextLevel before deleting them
		// in thisLevel. (We could finagle it atomically somehow.) Also, when reading we must
		// read, or at least acquire s.RLock(), in increasing order by level, so that we don't skip
		// a compaction.

		if err := nextLevel.replaceTables(cd.top); err != nil {
			return err
		}
		if err := thisLevel.deleteTables(cd.top); err != nil {
			return err
		}

		cd.elog.LazyPrintf("\tLOG Compact-Move %d->%d smallest:%s biggest:%s took %v\n",
			l, l+1, string(tbl.Smallest()), string(tbl.Biggest()), time.Since(timeStart))
		return nil
	}

	newTables, decr, err := s.compactBuildTables(l, cd)
	if err != nil {
		return err
	}
	defer func() {
		// Only assign to err, if it's not already nil.
		if decErr := decr(); err == nil {
			err = decErr
		}
	}()
	changeSet := buildChangeSet(&cd, newTables)

	// We write to the manifest _before_ we delete files (and after we created files)
	if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil {
		return err
	}

	// See comment earlier in this function about the ordering of these ops, and the order in which
	// we access levels when reading.
	if err := nextLevel.replaceTables(newTables); err != nil {
		return err
	}
	if err := thisLevel.deleteTables(cd.top); err != nil {
		return err
	}

	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
	// However, the tables are added only to the end, so it is ok to just delete the first table.

	cd.elog.LazyPrintf("LOG Compact %d->%d, del %d tables, add %d tables, took %v\n",
		l, l+1, len(cd.top)+len(cd.bot), len(newTables), time.Since(timeStart))
	return nil
}

// doCompact picks some table on level l and compacts it away to the next level.
func (s *levelsController) doCompact(p compactionPriority) (bool, error) {
	l := p.level
	y.AssertTrue(l+1 < s.kv.opt.MaxLevels) // Sanity check.

	cd := compactDef{
		elog:      trace.New("Badger", "Compact"),
		thisLevel: s.levels[l],
		nextLevel: s.levels[l+1],
	}
	cd.elog.SetMaxEvents(100)
	defer cd.elog.Finish()

	cd.elog.LazyPrintf("Got compaction priority: %+v", p)

	// While picking tables to be compacted, both levels' tables are expected to
	// remain unchanged.
	if l == 0 {
		if !s.fillTablesL0(&cd) {
			cd.elog.LazyPrintf("fillTables failed for level: %d\n", l)
			return false, nil
		}

	} else {
		if !s.fillTables(&cd) {
			cd.elog.LazyPrintf("fillTables failed for level: %d\n", l)
			return false, nil
		}
	}
	defer s.cstatus.delete(cd) // Remove the ranges from compaction status.

	cd.elog.LazyPrintf("Running for level: %d\n", cd.thisLevel.level)
	s.cstatus.toLog(cd.elog)
	if err := s.runCompactDef(l, cd); err != nil {
		// This compaction couldn't be done successfully.
		cd.elog.LazyPrintf("\tLOG Compact FAILED with error: %+v: %+v", err, cd)
		return false, err
	}

	s.cstatus.toLog(cd.elog)
	cd.elog.LazyPrintf("Compaction for level: %d DONE", cd.thisLevel.level)
	return true, nil
}

func (s *levelsController) addLevel0Table(t *table.Table) error {
	// We update the manifest _before_ the table becomes part of a levelHandler, because at that
	// point it could get used in some compaction. This ensures the manifest file gets updated in
	// the proper order. (That means this update happens before that of some compaction which
	// deletes the table.)
	err := s.kv.manifest.addChanges([]*protos.ManifestChange{
		makeTableCreateChange(t.ID(), 0),
	})
	if err != nil {
		return err
	}

	for !s.levels[0].tryAddLevel0Table(t) {
		// Stall. Make sure all levels are healthy before we unstall.
		var timeStart time.Time
		{
			s.elog.Printf("STALLED STALLED STALLED STALLED STALLED STALLED STALLED STALLED: %v\n",
				time.Since(lastUnstalled))
			s.cstatus.RLock()
			for i := 0; i < s.kv.opt.MaxLevels; i++ {
				s.elog.Printf("level=%d. Status=%s Size=%d\n",
					i, s.cstatus.levels[i].debug(), s.levels[i].getTotalSize())
			}
			s.cstatus.RUnlock()
			timeStart = time.Now()
		}
		// Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we
		// will very quickly fill up level 0 again and if the compaction strategy favors level 0,
		// then level 1 is going to be super full.
		for i := 0; ; i++ {
			// Passing 0 for delSize to compactable means we're treating incomplete compactions as
			// not having finished -- we wait for them to finish. Also, it's crucial this behavior
			// replicates pickCompactLevels' behavior in computing compactability in order to
			// guarantee progress.
			if !s.isLevel0Compactable() && !s.levels[1].isCompactable(0) {
				break
			}
			time.Sleep(10 * time.Millisecond)
			if i%100 == 0 {
				prios := s.pickCompactLevels()
				s.elog.Printf("Waiting to add level 0 table. Compaction priorities: %+v\n", prios)
				i = 0
			}
		}
		{
			s.elog.Printf("UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED: %v\n",
				time.Since(timeStart))
			lastUnstalled = time.Now()
		}
	}

	return nil
}

func (s *levelsController) close() error {
	err := s.cleanupLevels()
	return errors.Wrap(err, "levelsController.Close")
}

// get returns the found value if any. If not found, we return nil.
func (s *levelsController) get(key []byte, maxVs y.ValueStruct) (y.ValueStruct, error) {
	// It's important that we iterate the levels from 0 on upward. The reason is, if we iterated
	// in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could
	// read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do
	// parallelize this, we will need to call the h.RLock() function by increasing order of level
	// number.)

	version := y.ParseTs(key)
	for _, h := range s.levels {
		vs, err := h.get(key) // Calls h.RLock() and h.RUnlock().
		if err != nil {
			return y.ValueStruct{}, errors.Wrapf(err, "get key: %q", key)
		}
		if vs.Value == nil && vs.Meta == 0 {
			continue
		}
		if vs.Version == version {
			return vs, nil
		}
		if maxVs.Version < vs.Version {
			maxVs = vs
		}
	}
	return maxVs, nil
}

func appendIteratorsReversed(out []y.Iterator, th []*table.Table, reversed bool) []y.Iterator {
	for i := len(th) - 1; i >= 0; i-- {
		// This will increment the reference of the table handler.
		out = append(out, th[i].NewIterator(reversed))
	}
	return out
}

// appendIterators appends iterators to an array of iterators, for merging.
// Note: This obtains references for the table handlers. Remember to close these iterators.
func (s *levelsController) appendIterators(
	iters []y.Iterator, reversed bool) []y.Iterator {
	// Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing
	// data when there's a compaction.
	for _, level := range s.levels {
		iters = level.appendIterators(iters, reversed)
	}
	return iters
}
@ -0,0 +1,86 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

// ManagedDB allows end users to manage the transactions themselves. Transaction
// start and commit timestamps are set by end-user.
//
// This is only useful for databases built on top of Badger (like Dgraph), and
// can be ignored by most users.
//
// WARNING: This is an experimental feature and may be changed significantly in
// a future release. So please proceed with caution.
type ManagedDB struct {
	*DB
}

// OpenManaged returns a new ManagedDB, which allows more control over setting
// transaction timestamps.
//
// This is only useful for databases built on top of Badger (like Dgraph), and
// can be ignored by most users.
func OpenManaged(opts Options) (*ManagedDB, error) {
	opts.managedTxns = true
	db, err := Open(opts)
	if err != nil {
		return nil, err
	}
	return &ManagedDB{db}, nil
}

// NewTransaction overrides DB.NewTransaction() and panics when invoked. Use
// NewTransactionAt() instead.
func (db *ManagedDB) NewTransaction(update bool) {
	panic("Cannot use NewTransaction() for ManagedDB. Use NewTransactionAt() instead.")
}

// NewTransactionAt follows the same logic as DB.NewTransaction(), but uses the
// provided read timestamp.
//
// This is only useful for databases built on top of Badger (like Dgraph), and
// can be ignored by most users.
func (db *ManagedDB) NewTransactionAt(readTs uint64, update bool) *Txn {
	txn := db.DB.NewTransaction(update)
	txn.readTs = readTs
	return txn
}

// CommitAt commits the transaction, following the same logic as Commit(), but
// at the given commit timestamp. This will panic if not used with ManagedDB.
//
// This is only useful for databases built on top of Badger (like Dgraph), and
// can be ignored by most users.
func (txn *Txn) CommitAt(commitTs uint64, callback func(error)) error {
	if !txn.db.opt.managedTxns {
		return ErrManagedTxn
	}
	txn.commitTs = commitTs
	return txn.Commit(callback)
}
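
// A short usage sketch, assuming DefaultOptions and Txn.Set keep their usual
// shapes in this version of the package:
//
//	opts := DefaultOptions
//	opts.Dir, opts.ValueDir = "/tmp/badger", "/tmp/badger"
//	mdb, err := OpenManaged(opts)
//	if err != nil {
//		// handle error
//	}
//	txn := mdb.NewTransactionAt(42, true) // read snapshot at ts=42
//	defer txn.Discard()
//	_ = txn.Set([]byte("key"), []byte("value"))
//	err = txn.CommitAt(43, nil) // commit at ts=43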

// PurgeVersionsBelow will delete all versions of a key below the specified version
func (db *ManagedDB) PurgeVersionsBelow(key []byte, ts uint64) error {
	txn := db.NewTransactionAt(ts, false)
	defer txn.Discard()
	return db.purgeVersionsBelow(txn, key, ts)
}

// GetSequence is not supported on ManagedDB. Calling this would result
// in a panic.
func (db *ManagedDB) GetSequence(_ []byte, _ uint64) (*Sequence, error) {
	panic("Cannot use GetSequence for ManagedDB.")
}
@ -0,0 +1,433 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"hash/crc32"
	"io"
	"os"
	"path/filepath"
	"sync"

	"github.com/dgraph-io/badger/protos"
	"github.com/dgraph-io/badger/y"
	"github.com/pkg/errors"
)

// Manifest represents the contents of the MANIFEST file in a Badger store.
//
// The MANIFEST file describes the startup state of the db -- all LSM files and what level they're
// at.
//
// It consists of a sequence of ManifestChangeSet objects. Each of these is treated atomically,
// and contains a sequence of ManifestChange's (file creations/deletions) which we use to
// reconstruct the manifest at startup.
type Manifest struct {
	Levels []levelManifest
	Tables map[uint64]tableManifest

	// Contains total number of creation and deletion changes in the manifest -- used to compute
	// whether it'd be useful to rewrite the manifest.
	Creations int
	Deletions int
}

func createManifest() Manifest {
	levels := make([]levelManifest, 0)
	return Manifest{
		Levels: levels,
		Tables: make(map[uint64]tableManifest),
	}
}

// levelManifest contains information about LSM tree levels
// in the MANIFEST file.
type levelManifest struct {
	Tables map[uint64]struct{} // Set of table id's
}

// tableManifest contains information about a specific table
// in the LSM tree.
type tableManifest struct {
	Level uint8
}

// manifestFile holds the file pointer (and other info) about the manifest file, which is a log
// file we append to.
type manifestFile struct {
	fp        *os.File
	directory string
	// We make this configurable so that unit tests can hit rewrite() code quickly
	deletionsRewriteThreshold int

	// Guards appends, which includes access to the manifest field.
	appendLock sync.Mutex

	// Used to track the current state of the manifest, used when rewriting.
	manifest Manifest
}

const (
	// ManifestFilename is the filename for the manifest file.
	ManifestFilename                  = "MANIFEST"
	manifestRewriteFilename           = "MANIFEST-REWRITE"
	manifestDeletionsRewriteThreshold = 10000
	manifestDeletionsRatio            = 10
)

// asChanges returns a sequence of changes that could be used to recreate the Manifest in its
// present state.
func (m *Manifest) asChanges() []*protos.ManifestChange {
	changes := make([]*protos.ManifestChange, 0, len(m.Tables))
	for id, tm := range m.Tables {
		changes = append(changes, makeTableCreateChange(id, int(tm.Level)))
	}
	return changes
}

func (m *Manifest) clone() Manifest {
	changeSet := protos.ManifestChangeSet{Changes: m.asChanges()}
	ret := createManifest()
	y.Check(applyChangeSet(&ret, &changeSet))
	return ret
}

// openOrCreateManifestFile opens a Badger manifest file if it exists, or creates one if
// it doesn't.
func openOrCreateManifestFile(dir string, readOnly bool) (ret *manifestFile, result Manifest, err error) {
	return helpOpenOrCreateManifestFile(dir, readOnly, manifestDeletionsRewriteThreshold)
}

func helpOpenOrCreateManifestFile(dir string, readOnly bool, deletionsThreshold int) (ret *manifestFile, result Manifest, err error) {
	path := filepath.Join(dir, ManifestFilename)
	var flags uint32
	if readOnly {
		flags |= y.ReadOnly
	}
	fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock.
	if err != nil {
		if !os.IsNotExist(err) {
			return nil, Manifest{}, err
		}
		if readOnly {
			return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db")
		}
		m := createManifest()
		fp, netCreations, err := helpRewrite(dir, &m)
		if err != nil {
			return nil, Manifest{}, err
		}
		y.AssertTrue(netCreations == 0)
		mf := &manifestFile{
			fp:        fp,
			directory: dir,
			manifest:  m.clone(),
			deletionsRewriteThreshold: deletionsThreshold,
		}
		return mf, m, nil
	}

	manifest, truncOffset, err := ReplayManifestFile(fp)
	if err != nil {
		_ = fp.Close()
		return nil, Manifest{}, err
	}

	if !readOnly {
		// Truncate file so we don't have a half-written entry at the end.
		if err := fp.Truncate(truncOffset); err != nil {
			_ = fp.Close()
			return nil, Manifest{}, err
		}
	}
	if _, err = fp.Seek(0, io.SeekEnd); err != nil {
		_ = fp.Close()
		return nil, Manifest{}, err
	}

	mf := &manifestFile{
		fp:        fp,
		directory: dir,
		manifest:  manifest.clone(),
		deletionsRewriteThreshold: deletionsThreshold,
	}
	return mf, manifest, nil
}

func (mf *manifestFile) close() error {
	return mf.fp.Close()
}

// addChanges writes a batch of changes, atomically, to the file. "Atomically" here means that
// when we replay the MANIFEST file, we'll either replay all the changes or none of them. (The
// truth of this depends on the filesystem -- some might append garbage data if a system crash
// happens at the wrong time.)
func (mf *manifestFile) addChanges(changesParam []*protos.ManifestChange) error {
	changes := protos.ManifestChangeSet{Changes: changesParam}
	buf, err := changes.Marshal()
	if err != nil {
		return err
	}

	// Maybe we could use O_APPEND instead (on certain file systems)
	mf.appendLock.Lock()
	if err := applyChangeSet(&mf.manifest, &changes); err != nil {
		mf.appendLock.Unlock()
		return err
	}
	// Rewrite manifest if it'd shrink by 1/10 and it's big enough to care
	if mf.manifest.Deletions > mf.deletionsRewriteThreshold &&
		mf.manifest.Deletions > manifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) {
		if err := mf.rewrite(); err != nil {
			mf.appendLock.Unlock()
			return err
		}
	} else {
		var lenCrcBuf [8]byte
		binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf)))
		binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, y.CastagnoliCrcTable))
		buf = append(lenCrcBuf[:], buf...)
		if _, err := mf.fp.Write(buf); err != nil {
			mf.appendLock.Unlock()
			return err
		}
	}

	mf.appendLock.Unlock()
	return mf.fp.Sync()
}

// Has to be 4 bytes. The value can never change, ever, anyway.
var magicText = [4]byte{'B', 'd', 'g', 'r'}

// The magic version number.
const magicVersion = 4
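
// Taken together with the framing above, the on-disk manifest layout is:
// 4 bytes of magicText, then 4 bytes of big-endian magicVersion, followed by
// zero or more records of the form
//
//	[4-byte big-endian payload length][4-byte Castagnoli CRC32 of the payload]
//	[payload: one marshalled protos.ManifestChangeSet]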
|
||||
|
||||
func helpRewrite(dir string, m *Manifest) (*os.File, int, error) {
|
||||
rewritePath := filepath.Join(dir, manifestRewriteFilename)
|
||||
// We explicitly sync.
|
||||
fp, err := y.OpenTruncFile(rewritePath, false)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
buf := make([]byte, 8)
|
||||
copy(buf[0:4], magicText[:])
|
||||
binary.BigEndian.PutUint32(buf[4:8], magicVersion)
|
||||
|
||||
netCreations := len(m.Tables)
|
||||
changes := m.asChanges()
|
||||
set := protos.ManifestChangeSet{Changes: changes}
|
||||
|
||||
changeBuf, err := set.Marshal()
|
||||
if err != nil {
|
||||
fp.Close()
|
||||
return nil, 0, err
|
||||
}
|
||||
var lenCrcBuf [8]byte
|
||||
binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf)))
|
||||
binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, y.CastagnoliCrcTable))
|
||||
buf = append(buf, lenCrcBuf[:]...)
|
||||
buf = append(buf, changeBuf...)
|
||||
if _, err := fp.Write(buf); err != nil {
|
||||
fp.Close()
|
||||
return nil, 0, err
|
||||
}
|
||||
if err := fp.Sync(); err != nil {
|
||||
fp.Close()
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// In Windows the files should be closed before doing a Rename.
|
||||
if err = fp.Close(); err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
manifestPath := filepath.Join(dir, ManifestFilename)
|
||||
if err := os.Rename(rewritePath, manifestPath); err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
fp, err = y.OpenExistingFile(manifestPath, 0)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
if _, err := fp.Seek(0, io.SeekEnd); err != nil {
|
||||
fp.Close()
|
||||
return nil, 0, err
|
||||
}
|
||||
if err := syncDir(dir); err != nil {
|
||||
fp.Close()
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
return fp, netCreations, nil
|
||||
}
|
||||
|
||||
// Must be called while appendLock is held.
|
||||
func (mf *manifestFile) rewrite() error {
|
||||
// In Windows the files should be closed before doing a Rename.
|
||||
if err := mf.fp.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
fp, netCreations, err := helpRewrite(mf.directory, &mf.manifest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mf.fp = fp
|
||||
mf.manifest.Creations = netCreations
|
||||
mf.manifest.Deletions = 0
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
type countingReader struct {
|
||||
wrapped *bufio.Reader
|
||||
count int64
|
||||
}
|
||||
|
||||
func (r *countingReader) Read(p []byte) (n int, err error) {
|
||||
n, err = r.wrapped.Read(p)
|
||||
r.count += int64(n)
|
||||
return
|
||||
}
|
||||
|
||||
func (r *countingReader) ReadByte() (b byte, err error) {
|
||||
b, err = r.wrapped.ReadByte()
|
||||
if err == nil {
|
||||
r.count++
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
errBadMagic = errors.New("manifest has bad magic")
|
||||
)
|
||||
|
||||
// ReplayManifestFile reads the manifest file and constructs two manifest objects. (We need one
|
||||
// immutable copy and one mutable copy of the manifest. Easiest way is to construct two of them.)
|
||||
// Also, returns the last offset after a completely read manifest entry -- the file must be
|
||||
// truncated at that point before further appends are made (if there is a partial entry after
|
||||
// that). In normal conditions, truncOffset is the file size.
|
||||
func ReplayManifestFile(fp *os.File) (ret Manifest, truncOffset int64, err error) {
|
||||
r := countingReader{wrapped: bufio.NewReader(fp)}
|
||||
|
||||
var magicBuf [8]byte
|
||||
if _, err := io.ReadFull(&r, magicBuf[:]); err != nil {
|
||||
return Manifest{}, 0, errBadMagic
|
||||
}
|
||||
if !bytes.Equal(magicBuf[0:4], magicText[:]) {
|
||||
return Manifest{}, 0, errBadMagic
|
||||
}
|
||||
version := binary.BigEndian.Uint32(magicBuf[4:8])
|
||||
if version != magicVersion {
|
||||
return Manifest{}, 0,
|
||||
fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, magicVersion)
|
||||
}
|
||||
|
||||
build := createManifest()
|
||||
var offset int64
|
||||
for {
|
||||
offset = r.count
|
||||
var lenCrcBuf [8]byte
|
||||
_, err := io.ReadFull(&r, lenCrcBuf[:])
|
||||
if err != nil {
|
||||
if err == io.EOF || err == io.ErrUnexpectedEOF {
|
||||
break
|
||||
}
|
||||
return Manifest{}, 0, err
|
||||
}
|
||||
length := binary.BigEndian.Uint32(lenCrcBuf[0:4])
|
||||
var buf = make([]byte, length)
|
||||
if _, err := io.ReadFull(&r, buf); err != nil {
|
||||
if err == io.EOF || err == io.ErrUnexpectedEOF {
|
||||
break
|
||||
}
|
||||
return Manifest{}, 0, err
|
||||
}
|
||||
if crc32.Checksum(buf, y.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) {
|
||||
break
|
||||
}
|
||||
|
||||
var changeSet protos.ManifestChangeSet
|
||||
if err := changeSet.Unmarshal(buf); err != nil {
|
||||
return Manifest{}, 0, err
|
||||
}
|
||||
|
||||
if err := applyChangeSet(&build, &changeSet); err != nil {
|
||||
return Manifest{}, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return build, offset, err
|
||||
}
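
// Editor's note: a sketch of typical caller flow for ReplayManifestFile (the
// open flags and helper name are illustrative, not badger's exact open logic):
// open the MANIFEST, replay it, then truncate at truncOffset so a partially
// written trailing record never corrupts future appends.
func loadManifestSketch(dir string) (Manifest, *os.File, error) {
	fp, err := os.OpenFile(filepath.Join(dir, ManifestFilename), os.O_RDWR, 0600)
	if err != nil {
		return Manifest{}, nil, err
	}
	m, truncOffset, err := ReplayManifestFile(fp)
	if err != nil {
		fp.Close()
		return Manifest{}, nil, err
	}
	// Drop any half-written record so appends resume at a clean boundary.
	if err := fp.Truncate(truncOffset); err != nil {
		fp.Close()
		return Manifest{}, nil, err
	}
	if _, err := fp.Seek(truncOffset, io.SeekStart); err != nil {
		fp.Close()
		return Manifest{}, nil, err
	}
	return m, fp, nil
}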

func applyManifestChange(build *Manifest, tc *protos.ManifestChange) error {
	switch tc.Op {
	case protos.ManifestChange_CREATE:
		if _, ok := build.Tables[tc.Id]; ok {
			return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id)
		}
		build.Tables[tc.Id] = tableManifest{
			Level: uint8(tc.Level),
		}
		for len(build.Levels) <= int(tc.Level) {
			build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})})
		}
		build.Levels[tc.Level].Tables[tc.Id] = struct{}{}
		build.Creations++
	case protos.ManifestChange_DELETE:
		tm, ok := build.Tables[tc.Id]
		if !ok {
			return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id)
		}
		delete(build.Levels[tm.Level].Tables, tc.Id)
		delete(build.Tables, tc.Id)
		build.Deletions++
	default:
		return fmt.Errorf("MANIFEST file has invalid manifestChange op")
	}
	return nil
}

// This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is
// just plain broken.
func applyChangeSet(build *Manifest, changeSet *protos.ManifestChangeSet) error {
	for _, change := range changeSet.Changes {
		if err := applyManifestChange(build, change); err != nil {
			return err
		}
	}
	return nil
}

func makeTableCreateChange(id uint64, level int) *protos.ManifestChange {
	return &protos.ManifestChange{
		Id:    id,
		Op:    protos.ManifestChange_CREATE,
		Level: uint32(level),
	}
}

func makeTableDeleteChange(id uint64) *protos.ManifestChange {
	return &protos.ManifestChange{
		Id: id,
		Op: protos.ManifestChange_DELETE,
	}
}
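
// Editor's note (illustrative only): a compaction that replaces two tables with
// one merged table would batch its manifest edits like this, so that replay
// applies all three or none. The table IDs are made up for the example.
func exampleCompactionChanges() []*protos.ManifestChange {
	// A caller would hand this slice to (*manifestFile).addChanges, which wraps
	// it in a single ManifestChangeSet record.
	return []*protos.ManifestChange{
		makeTableCreateChange(42, 1), // hypothetical merged table lands on level 1
		makeTableDeleteChange(17),    // the two source tables are removed
		makeTableDeleteChange(23),    // in the same atomic change set
	}
}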
@ -0,0 +1,122 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"github.com/dgraph-io/badger/options"
)

// NOTE: Keep the comments in the following to 75 chars width, so they
// format nicely in godoc.

// Options are params for creating DB object.
//
// This package provides DefaultOptions which contains options that should
// work for most applications. Consider using that as a starting point before
// customizing it for your own needs.
type Options struct {
	// 1. Mandatory flags
	// -------------------
	// Directory to store the data in. Should exist and be writable.
	Dir string
	// Directory to store the value log in. Can be the same as Dir. Should
	// exist and be writable.
	ValueDir string

	// 2. Frequently modified flags
	// -----------------------------
	// Sync all writes to disk. Setting this to true would slow down data
	// loading significantly.
	SyncWrites bool

	// How should LSM tree be accessed.
	TableLoadingMode options.FileLoadingMode

	// How should value log be accessed.
	ValueLogLoadingMode options.FileLoadingMode

	// 3. Flags that user might want to review
	// ----------------------------------------
	// The following affect all levels of LSM tree.
	MaxTableSize        int64 // Each table (or file) is at most this size.
	LevelSizeMultiplier int   // Equals SizeOf(Li+1)/SizeOf(Li).
	MaxLevels           int   // Maximum number of levels of compaction.
	// If value size >= this threshold, only store value offsets in tree.
	ValueThreshold int
	// Maximum number of tables to keep in memory, before stalling.
	NumMemtables int
	// The following affect how we handle LSM tree L0.
	// Maximum number of Level 0 tables before we start compacting.
	NumLevelZeroTables int

	// If we hit this number of Level 0 tables, we will stall until L0 is
	// compacted away.
	NumLevelZeroTablesStall int

	// Maximum total size for L1.
	LevelOneSize int64

	// Size of single value log file.
	ValueLogFileSize int64

	// Number of compaction workers to run concurrently.
	NumCompactors int

	// Transaction start and commit timestamps are managed by end-user. This
	// is a private option used by ManagedDB.
	managedTxns bool

	// 4. Flags for testing purposes
	// ------------------------------
	DoNotCompact bool // Stops LSM tree from compactions.

	maxBatchCount int64 // max entries in batch
	maxBatchSize  int64 // max batch size in bytes

	// Open the DB as read-only. With this set, multiple processes can
	// open the same Badger DB. Note: if the DB being opened had crashed
	// before and has vlog data to be replayed, ReadOnly will cause Open
	// to fail with an appropriate message.
	ReadOnly bool

	// Truncate value log to delete corrupt data, if any. Would not truncate if ReadOnly is set.
	Truncate bool
}

// DefaultOptions sets a list of recommended options for good performance.
// Feel free to modify these to suit your needs.
var DefaultOptions = Options{
	DoNotCompact:        false,
	LevelOneSize:        256 << 20,
	LevelSizeMultiplier: 10,
	TableLoadingMode:    options.LoadToRAM,
	ValueLogLoadingMode: options.MemoryMap,
	// table.MemoryMap to mmap() the tables.
	// table.Nothing to not preload the tables.
	MaxLevels:               7,
	MaxTableSize:            64 << 20,
	NumCompactors:           3,
	NumLevelZeroTables:      5,
	NumLevelZeroTablesStall: 10,
	NumMemtables:            5,
	SyncWrites:              true,
	// Nothing to read/write value log using standard File I/O
	// MemoryMap to mmap() the value log files
	ValueLogFileSize: 1 << 30,
	ValueThreshold:   20,
	Truncate:         false,
}
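
// Editor's note: typical client usage starts from DefaultOptions and overrides
// just the directories. A sketch, assuming this vendored API where Open takes
// an Options value:
//
//	opts := badger.DefaultOptions
//	opts.Dir = "/tmp/badger"
//	opts.ValueDir = "/tmp/badger" // may point at a different disk than Dir
//	db, err := badger.Open(opts)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer db.Close()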
@ -0,0 +1,30 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package options

// FileLoadingMode specifies how data in LSM table files and value log files should
// be loaded.
type FileLoadingMode int

const (
	// FileIO indicates that files must be loaded using standard I/O
	FileIO FileLoadingMode = iota
	// LoadToRAM indicates that file must be loaded into RAM
	LoadToRAM
	// MemoryMap indicates that the file must be memory-mapped
	MemoryMap
)
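
// Editor's note: a sketch of the trade-off these modes control. A
// memory-constrained deployment can lower resident memory at the cost of read
// latency by selecting standard file I/O for both structures:
//
//	opts := badger.DefaultOptions
//	opts.TableLoadingMode = options.FileIO    // default is options.LoadToRAM
//	opts.ValueLogLoadingMode = options.FileIO // default is options.MemoryMap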
@ -0,0 +1,497 @@
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: backup.proto

/*
	Package protos is a generated protocol buffer package.

	It is generated from these files:
		backup.proto
		manifest.proto

	It has these top-level messages:
		KVPair
		ManifestChangeSet
		ManifestChange
*/
package protos

import proto "github.com/golang/protobuf/proto"
import fmt "fmt"
import math "math"

import io "io"

// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf

// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package

type KVPair struct {
	Key       []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"`
	Value     []byte `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"`
	UserMeta  []byte `protobuf:"bytes,3,opt,name=userMeta,proto3" json:"userMeta,omitempty"`
	Version   uint64 `protobuf:"varint,4,opt,name=version,proto3" json:"version,omitempty"`
	ExpiresAt uint64 `protobuf:"varint,5,opt,name=expires_at,json=expiresAt,proto3" json:"expires_at,omitempty"`
}

func (m *KVPair) Reset()                    { *m = KVPair{} }
func (m *KVPair) String() string            { return proto.CompactTextString(m) }
func (*KVPair) ProtoMessage()               {}
func (*KVPair) Descriptor() ([]byte, []int) { return fileDescriptorBackup, []int{0} }

func (m *KVPair) GetKey() []byte {
	if m != nil {
		return m.Key
	}
	return nil
}

func (m *KVPair) GetValue() []byte {
	if m != nil {
		return m.Value
	}
	return nil
}

func (m *KVPair) GetUserMeta() []byte {
	if m != nil {
		return m.UserMeta
	}
	return nil
}

func (m *KVPair) GetVersion() uint64 {
	if m != nil {
		return m.Version
	}
	return 0
}

func (m *KVPair) GetExpiresAt() uint64 {
	if m != nil {
		return m.ExpiresAt
	}
	return 0
}

func init() {
	proto.RegisterType((*KVPair)(nil), "protos.KVPair")
}
func (m *KVPair) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalTo(dAtA)
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

func (m *KVPair) MarshalTo(dAtA []byte) (int, error) {
	var i int
	_ = i
	var l int
	_ = l
	if len(m.Key) > 0 {
		dAtA[i] = 0xa
		i++
		i = encodeVarintBackup(dAtA, i, uint64(len(m.Key)))
		i += copy(dAtA[i:], m.Key)
	}
	if len(m.Value) > 0 {
		dAtA[i] = 0x12
		i++
		i = encodeVarintBackup(dAtA, i, uint64(len(m.Value)))
		i += copy(dAtA[i:], m.Value)
	}
	if len(m.UserMeta) > 0 {
		dAtA[i] = 0x1a
		i++
		i = encodeVarintBackup(dAtA, i, uint64(len(m.UserMeta)))
		i += copy(dAtA[i:], m.UserMeta)
	}
	if m.Version != 0 {
		dAtA[i] = 0x20
		i++
		i = encodeVarintBackup(dAtA, i, uint64(m.Version))
	}
	if m.ExpiresAt != 0 {
		dAtA[i] = 0x28
		i++
		i = encodeVarintBackup(dAtA, i, uint64(m.ExpiresAt))
	}
	return i, nil
}

func encodeFixed64Backup(dAtA []byte, offset int, v uint64) int {
	dAtA[offset] = uint8(v)
	dAtA[offset+1] = uint8(v >> 8)
	dAtA[offset+2] = uint8(v >> 16)
	dAtA[offset+3] = uint8(v >> 24)
	dAtA[offset+4] = uint8(v >> 32)
	dAtA[offset+5] = uint8(v >> 40)
	dAtA[offset+6] = uint8(v >> 48)
	dAtA[offset+7] = uint8(v >> 56)
	return offset + 8
}
func encodeFixed32Backup(dAtA []byte, offset int, v uint32) int {
	dAtA[offset] = uint8(v)
	dAtA[offset+1] = uint8(v >> 8)
	dAtA[offset+2] = uint8(v >> 16)
	dAtA[offset+3] = uint8(v >> 24)
	return offset + 4
}
func encodeVarintBackup(dAtA []byte, offset int, v uint64) int {
	for v >= 1<<7 {
		dAtA[offset] = uint8(v&0x7f | 0x80)
		v >>= 7
		offset++
	}
	dAtA[offset] = uint8(v)
	return offset + 1
}
func (m *KVPair) Size() (n int) {
	var l int
	_ = l
	l = len(m.Key)
	if l > 0 {
		n += 1 + l + sovBackup(uint64(l))
	}
	l = len(m.Value)
	if l > 0 {
		n += 1 + l + sovBackup(uint64(l))
	}
	l = len(m.UserMeta)
	if l > 0 {
		n += 1 + l + sovBackup(uint64(l))
	}
	if m.Version != 0 {
		n += 1 + sovBackup(uint64(m.Version))
	}
	if m.ExpiresAt != 0 {
		n += 1 + sovBackup(uint64(m.ExpiresAt))
	}
	return n
}

func sovBackup(x uint64) (n int) {
	for {
		n++
		x >>= 7
		if x == 0 {
			break
		}
	}
	return n
}
func sozBackup(x uint64) (n int) {
	return sovBackup(uint64((x << 1) ^ uint64((int64(x) >> 63))))
}
func (m *KVPair) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowBackup
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: KVPair: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: KVPair: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthBackup
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...)
			if m.Key == nil {
				m.Key = []byte{}
			}
			iNdEx = postIndex
		case 2:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthBackup
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Value = append(m.Value[:0], dAtA[iNdEx:postIndex]...)
			if m.Value == nil {
				m.Value = []byte{}
			}
			iNdEx = postIndex
		case 3:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field UserMeta", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthBackup
			}
			postIndex := iNdEx + byteLen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.UserMeta = append(m.UserMeta[:0], dAtA[iNdEx:postIndex]...)
			if m.UserMeta == nil {
				m.UserMeta = []byte{}
			}
			iNdEx = postIndex
		case 4:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType)
			}
			m.Version = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Version |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 5:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ExpiresAt", wireType)
			}
			m.ExpiresAt = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.ExpiresAt |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		default:
			iNdEx = preIndex
			skippy, err := skipBackup(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if skippy < 0 {
				return ErrInvalidLengthBackup
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
func skipBackup(dAtA []byte) (n int, err error) {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return 0, ErrIntOverflowBackup
			}
			if iNdEx >= l {
				return 0, io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		wireType := int(wire & 0x7)
		switch wireType {
		case 0:
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				iNdEx++
				if dAtA[iNdEx-1] < 0x80 {
					break
				}
			}
			return iNdEx, nil
		case 1:
			iNdEx += 8
			return iNdEx, nil
		case 2:
			var length int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowBackup
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				length |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			iNdEx += length
			if length < 0 {
				return 0, ErrInvalidLengthBackup
			}
			return iNdEx, nil
		case 3:
			for {
				var innerWire uint64
				var start int = iNdEx
				for shift := uint(0); ; shift += 7 {
					if shift >= 64 {
						return 0, ErrIntOverflowBackup
					}
					if iNdEx >= l {
						return 0, io.ErrUnexpectedEOF
					}
					b := dAtA[iNdEx]
					iNdEx++
					innerWire |= (uint64(b) & 0x7F) << shift
					if b < 0x80 {
						break
					}
				}
				innerWireType := int(innerWire & 0x7)
				if innerWireType == 4 {
					break
				}
				next, err := skipBackup(dAtA[start:])
				if err != nil {
					return 0, err
				}
				iNdEx = start + next
			}
			return iNdEx, nil
		case 4:
			return iNdEx, nil
		case 5:
			iNdEx += 4
			return iNdEx, nil
		default:
			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
		}
	}
	panic("unreachable")
}

var (
	ErrInvalidLengthBackup = fmt.Errorf("proto: negative length found during unmarshaling")
	ErrIntOverflowBackup   = fmt.Errorf("proto: integer overflow")
)

func init() { proto.RegisterFile("backup.proto", fileDescriptorBackup) }

var fileDescriptorBackup = []byte{
	// 167 bytes of a gzipped FileDescriptorProto
	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x49, 0x4a, 0x4c, 0xce,
	0x2e, 0x2d, 0xd0, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0x03, 0x53, 0xc5, 0x4a, 0xad, 0x8c,
	0x5c, 0x6c, 0xde, 0x61, 0x01, 0x89, 0x99, 0x45, 0x42, 0x02, 0x5c, 0xcc, 0xd9, 0xa9, 0x95, 0x12,
	0x8c, 0x0a, 0x8c, 0x1a, 0x3c, 0x41, 0x20, 0xa6, 0x90, 0x08, 0x17, 0x6b, 0x59, 0x62, 0x4e, 0x69,
	0xaa, 0x04, 0x13, 0x58, 0x0c, 0xc2, 0x11, 0x92, 0xe2, 0xe2, 0x28, 0x2d, 0x4e, 0x2d, 0xf2, 0x4d,
	0x2d, 0x49, 0x94, 0x60, 0x06, 0x4b, 0xc0, 0xf9, 0x42, 0x12, 0x5c, 0xec, 0x65, 0xa9, 0x45, 0xc5,
	0x99, 0xf9, 0x79, 0x12, 0x2c, 0x0a, 0x8c, 0x1a, 0x2c, 0x41, 0x30, 0xae, 0x90, 0x2c, 0x17, 0x57,
	0x6a, 0x45, 0x41, 0x66, 0x51, 0x6a, 0x71, 0x7c, 0x62, 0x89, 0x04, 0x2b, 0x58, 0x92, 0x13, 0x2a,
	0xe2, 0x58, 0xe2, 0x24, 0x70, 0xe2, 0x91, 0x1c, 0xe3, 0x85, 0x47, 0x72, 0x8c, 0x0f, 0x1e, 0xc9,
	0x31, 0xce, 0x78, 0x2c, 0xc7, 0x90, 0x04, 0x71, 0xa1, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0xe7,
	0x3f, 0x3f, 0x95, 0xb8, 0x00, 0x00, 0x00,
}
@ -0,0 +1,28 @@
/*
 * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Use protos/gen.sh to generate .pb.go files.
syntax = "proto3";

package protos;

message KVPair {
	bytes key = 1;
	bytes value = 2;
	bytes userMeta = 3;
	uint64 version = 4;
	uint64 expires_at = 5;
}
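
// Editor's note (illustrative): the generated Go type in backup.pb.go above
// round-trips through the gogo Marshal/Unmarshal methods, e.g.:
//
//   kv := &protos.KVPair{Key: []byte("k"), Value: []byte("v"), Version: 7}
//   buf, _ := kv.Marshal()   // wire-format bytes
//   var out protos.KVPair
//   _ = out.Unmarshal(buf)   // out now equals *kv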
@ -0,0 +1,7 @@
#!/bin/bash

# You might need to go get -v github.com/gogo/protobuf/...

protos=${GOPATH-$HOME/go}/src/github.com/dgraph-io/badger/protos
pushd $protos > /dev/null
protoc --gofast_out=plugins=grpc:. -I=. *.proto
@ -0,0 +1,534 @@
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: manifest.proto

package protos

import proto "github.com/golang/protobuf/proto"
import fmt "fmt"
import math "math"

import io "io"

// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf

type ManifestChange_Operation int32

const (
	ManifestChange_CREATE ManifestChange_Operation = 0
	ManifestChange_DELETE ManifestChange_Operation = 1
)

var ManifestChange_Operation_name = map[int32]string{
	0: "CREATE",
	1: "DELETE",
}
var ManifestChange_Operation_value = map[string]int32{
	"CREATE": 0,
	"DELETE": 1,
}

func (x ManifestChange_Operation) String() string {
	return proto.EnumName(ManifestChange_Operation_name, int32(x))
}
func (ManifestChange_Operation) EnumDescriptor() ([]byte, []int) {
	return fileDescriptorManifest, []int{1, 0}
}

type ManifestChangeSet struct {
	// A set of changes that are applied atomically.
	Changes []*ManifestChange `protobuf:"bytes,1,rep,name=changes" json:"changes,omitempty"`
}

func (m *ManifestChangeSet) Reset()                    { *m = ManifestChangeSet{} }
func (m *ManifestChangeSet) String() string            { return proto.CompactTextString(m) }
func (*ManifestChangeSet) ProtoMessage()               {}
func (*ManifestChangeSet) Descriptor() ([]byte, []int) { return fileDescriptorManifest, []int{0} }

func (m *ManifestChangeSet) GetChanges() []*ManifestChange {
	if m != nil {
		return m.Changes
	}
	return nil
}

type ManifestChange struct {
	Id    uint64                   `protobuf:"varint,1,opt,name=Id,proto3" json:"Id,omitempty"`
	Op    ManifestChange_Operation `protobuf:"varint,2,opt,name=Op,proto3,enum=protos.ManifestChange_Operation" json:"Op,omitempty"`
	Level uint32                   `protobuf:"varint,3,opt,name=Level,proto3" json:"Level,omitempty"`
}

func (m *ManifestChange) Reset()                    { *m = ManifestChange{} }
func (m *ManifestChange) String() string            { return proto.CompactTextString(m) }
func (*ManifestChange) ProtoMessage()               {}
func (*ManifestChange) Descriptor() ([]byte, []int) { return fileDescriptorManifest, []int{1} }

func (m *ManifestChange) GetId() uint64 {
	if m != nil {
		return m.Id
	}
	return 0
}

func (m *ManifestChange) GetOp() ManifestChange_Operation {
	if m != nil {
		return m.Op
	}
	return ManifestChange_CREATE
}

func (m *ManifestChange) GetLevel() uint32 {
	if m != nil {
		return m.Level
	}
	return 0
}

func init() {
	proto.RegisterType((*ManifestChangeSet)(nil), "protos.ManifestChangeSet")
	proto.RegisterType((*ManifestChange)(nil), "protos.ManifestChange")
	proto.RegisterEnum("protos.ManifestChange_Operation", ManifestChange_Operation_name, ManifestChange_Operation_value)
}
func (m *ManifestChangeSet) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalTo(dAtA)
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

func (m *ManifestChangeSet) MarshalTo(dAtA []byte) (int, error) {
	var i int
	_ = i
	var l int
	_ = l
	if len(m.Changes) > 0 {
		for _, msg := range m.Changes {
			dAtA[i] = 0xa
			i++
			i = encodeVarintManifest(dAtA, i, uint64(msg.Size()))
			n, err := msg.MarshalTo(dAtA[i:])
			if err != nil {
				return 0, err
			}
			i += n
		}
	}
	return i, nil
}

func (m *ManifestChange) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalTo(dAtA)
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

func (m *ManifestChange) MarshalTo(dAtA []byte) (int, error) {
	var i int
	_ = i
	var l int
	_ = l
	if m.Id != 0 {
		dAtA[i] = 0x8
		i++
		i = encodeVarintManifest(dAtA, i, uint64(m.Id))
	}
	if m.Op != 0 {
		dAtA[i] = 0x10
		i++
		i = encodeVarintManifest(dAtA, i, uint64(m.Op))
	}
	if m.Level != 0 {
		dAtA[i] = 0x18
		i++
		i = encodeVarintManifest(dAtA, i, uint64(m.Level))
	}
	return i, nil
}

func encodeFixed64Manifest(dAtA []byte, offset int, v uint64) int {
	dAtA[offset] = uint8(v)
	dAtA[offset+1] = uint8(v >> 8)
	dAtA[offset+2] = uint8(v >> 16)
	dAtA[offset+3] = uint8(v >> 24)
	dAtA[offset+4] = uint8(v >> 32)
	dAtA[offset+5] = uint8(v >> 40)
	dAtA[offset+6] = uint8(v >> 48)
	dAtA[offset+7] = uint8(v >> 56)
	return offset + 8
}
func encodeFixed32Manifest(dAtA []byte, offset int, v uint32) int {
	dAtA[offset] = uint8(v)
	dAtA[offset+1] = uint8(v >> 8)
	dAtA[offset+2] = uint8(v >> 16)
	dAtA[offset+3] = uint8(v >> 24)
	return offset + 4
}
func encodeVarintManifest(dAtA []byte, offset int, v uint64) int {
	for v >= 1<<7 {
		dAtA[offset] = uint8(v&0x7f | 0x80)
		v >>= 7
		offset++
	}
	dAtA[offset] = uint8(v)
	return offset + 1
}
func (m *ManifestChangeSet) Size() (n int) {
	var l int
	_ = l
	if len(m.Changes) > 0 {
		for _, e := range m.Changes {
			l = e.Size()
			n += 1 + l + sovManifest(uint64(l))
		}
	}
	return n
}

func (m *ManifestChange) Size() (n int) {
	var l int
	_ = l
	if m.Id != 0 {
		n += 1 + sovManifest(uint64(m.Id))
	}
	if m.Op != 0 {
		n += 1 + sovManifest(uint64(m.Op))
	}
	if m.Level != 0 {
		n += 1 + sovManifest(uint64(m.Level))
	}
	return n
}

func sovManifest(x uint64) (n int) {
	for {
		n++
		x >>= 7
		if x == 0 {
			break
		}
	}
	return n
}
func sozManifest(x uint64) (n int) {
	return sovManifest(uint64((x << 1) ^ uint64((int64(x) >> 63))))
}
func (m *ManifestChangeSet) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowManifest
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: ManifestChangeSet: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: ManifestChangeSet: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Changes", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowManifest
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if msglen < 0 {
				return ErrInvalidLengthManifest
			}
			postIndex := iNdEx + msglen
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Changes = append(m.Changes, &ManifestChange{})
			if err := m.Changes[len(m.Changes)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		default:
			iNdEx = preIndex
			skippy, err := skipManifest(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if skippy < 0 {
				return ErrInvalidLengthManifest
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
func (m *ManifestChange) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowManifest
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: ManifestChange: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: ManifestChange: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType)
			}
			m.Id = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowManifest
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Id |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Op", wireType)
			}
			m.Op = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowManifest
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Op |= (ManifestChange_Operation(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Level", wireType)
			}
			m.Level = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowManifest
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Level |= (uint32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		default:
			iNdEx = preIndex
			skippy, err := skipManifest(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if skippy < 0 {
				return ErrInvalidLengthManifest
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
func skipManifest(dAtA []byte) (n int, err error) {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return 0, ErrIntOverflowManifest
			}
			if iNdEx >= l {
				return 0, io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		wireType := int(wire & 0x7)
		switch wireType {
		case 0:
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowManifest
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				iNdEx++
				if dAtA[iNdEx-1] < 0x80 {
					break
				}
			}
			return iNdEx, nil
		case 1:
			iNdEx += 8
			return iNdEx, nil
		case 2:
			var length int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowManifest
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				length |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			iNdEx += length
			if length < 0 {
				return 0, ErrInvalidLengthManifest
			}
			return iNdEx, nil
		case 3:
			for {
				var innerWire uint64
				var start int = iNdEx
				for shift := uint(0); ; shift += 7 {
					if shift >= 64 {
						return 0, ErrIntOverflowManifest
					}
					if iNdEx >= l {
						return 0, io.ErrUnexpectedEOF
					}
					b := dAtA[iNdEx]
					iNdEx++
					innerWire |= (uint64(b) & 0x7F) << shift
					if b < 0x80 {
						break
					}
				}
				innerWireType := int(innerWire & 0x7)
				if innerWireType == 4 {
					break
				}
				next, err := skipManifest(dAtA[start:])
				if err != nil {
					return 0, err
				}
				iNdEx = start + next
			}
			return iNdEx, nil
		case 4:
			return iNdEx, nil
		case 5:
			iNdEx += 4
			return iNdEx, nil
		default:
			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
		}
	}
	panic("unreachable")
}

var (
	ErrInvalidLengthManifest = fmt.Errorf("proto: negative length found during unmarshaling")
	ErrIntOverflowManifest   = fmt.Errorf("proto: integer overflow")
)

func init() { proto.RegisterFile("manifest.proto", fileDescriptorManifest) }

var fileDescriptorManifest = []byte{
	// 208 bytes of a gzipped FileDescriptorProto
	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0xcb, 0x4d, 0xcc, 0xcb,
	0x4c, 0x4b, 0x2d, 0x2e, 0xd1, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0x03, 0x53, 0xc5, 0x4a,
	0xae, 0x5c, 0x82, 0xbe, 0x50, 0x19, 0xe7, 0x8c, 0xc4, 0xbc, 0xf4, 0xd4, 0xe0, 0xd4, 0x12, 0x21,
	0x03, 0x2e, 0xf6, 0x64, 0x30, 0xa7, 0x58, 0x82, 0x51, 0x81, 0x59, 0x83, 0xdb, 0x48, 0x0c, 0xa2,
	0xab, 0x58, 0x0f, 0x55, 0x6d, 0x10, 0x4c, 0x99, 0x52, 0x2f, 0x23, 0x17, 0x1f, 0xaa, 0x9c, 0x10,
	0x1f, 0x17, 0x93, 0x67, 0x8a, 0x04, 0xa3, 0x02, 0xa3, 0x06, 0x4b, 0x10, 0x93, 0x67, 0x8a, 0x90,
	0x01, 0x17, 0x93, 0x7f, 0x81, 0x04, 0x93, 0x02, 0xa3, 0x06, 0x9f, 0x91, 0x02, 0x76, 0xf3, 0xf4,
	0xfc, 0x0b, 0x52, 0x8b, 0x12, 0x4b, 0x32, 0xf3, 0xf3, 0x82, 0x98, 0xfc, 0x0b, 0x84, 0x44, 0xb8,
	0x58, 0x7d, 0x52, 0xcb, 0x52, 0x73, 0x24, 0x98, 0x15, 0x18, 0x35, 0x78, 0x83, 0x20, 0x1c, 0x25,
	0x65, 0x2e, 0x4e, 0xb8, 0x32, 0x21, 0x2e, 0x2e, 0x36, 0xe7, 0x20, 0x57, 0xc7, 0x10, 0x57, 0x01,
	0x06, 0x10, 0xdb, 0xc5, 0xd5, 0xc7, 0x35, 0xc4, 0x55, 0x80, 0xd1, 0x49, 0xe0, 0xc4, 0x23, 0x39,
	0xc6, 0x0b, 0x8f, 0xe4, 0x18, 0x1f, 0x3c, 0x92, 0x63, 0x9c, 0xf1, 0x58, 0x8e, 0x21, 0x09, 0xe2,
	0x61, 0x63, 0x40, 0x00, 0x00, 0x00, 0xff, 0xff, 0x42, 0x6f, 0x23, 0xc9, 0x09, 0x01, 0x00, 0x00,
}
@ -0,0 +1,35 @@
/*
 * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Use protos/gen.sh to generate .pb.go files.
syntax = "proto3";

package protos;

message ManifestChangeSet {
	// A set of changes that are applied atomically.
	repeated ManifestChange changes = 1;
}

message ManifestChange {
	uint64 Id = 1;
	enum Operation {
		CREATE = 0;
		DELETE = 1;
	}
	Operation Op = 2;
	uint32 Level = 3; // Only used for CREATE
}
@ -0,0 +1,113 @@
This is much better than `skiplist` and `slist`.

```
BenchmarkReadWrite/frac_0-8       3000000    537 ns/op
BenchmarkReadWrite/frac_1-8       3000000    503 ns/op
BenchmarkReadWrite/frac_2-8       3000000    492 ns/op
BenchmarkReadWrite/frac_3-8       3000000    475 ns/op
BenchmarkReadWrite/frac_4-8       3000000    440 ns/op
BenchmarkReadWrite/frac_5-8       5000000    442 ns/op
BenchmarkReadWrite/frac_6-8       5000000    380 ns/op
BenchmarkReadWrite/frac_7-8       5000000    338 ns/op
BenchmarkReadWrite/frac_8-8       5000000    294 ns/op
BenchmarkReadWrite/frac_9-8      10000000    268 ns/op
BenchmarkReadWrite/frac_10-8    100000000     26.3 ns/op
```

And even better than a simple map with read-write lock:

```
BenchmarkReadWriteMap/frac_0-8     2000000    774 ns/op
BenchmarkReadWriteMap/frac_1-8     2000000    647 ns/op
BenchmarkReadWriteMap/frac_2-8     3000000    605 ns/op
BenchmarkReadWriteMap/frac_3-8     3000000    603 ns/op
BenchmarkReadWriteMap/frac_4-8     3000000    556 ns/op
BenchmarkReadWriteMap/frac_5-8     3000000    472 ns/op
BenchmarkReadWriteMap/frac_6-8     3000000    476 ns/op
BenchmarkReadWriteMap/frac_7-8     3000000    457 ns/op
BenchmarkReadWriteMap/frac_8-8     5000000    444 ns/op
BenchmarkReadWriteMap/frac_9-8     5000000    361 ns/op
BenchmarkReadWriteMap/frac_10-8   10000000    212 ns/op
```

# Node Pooling

Command used

```
rm -Rf tmp && /usr/bin/time -l ./populate -keys_mil 10
```

For pprof results, we run without using /usr/bin/time. There are four runs below.

Results seem to vary quite a bit between runs.

## Before node pooling

```
1311.53MB of 1338.69MB total (97.97%)
Dropped 30 nodes (cum <= 6.69MB)
Showing top 10 nodes out of 37 (cum >= 12.50MB)
      flat  flat%   sum%        cum   cum%
  523.04MB 39.07% 39.07%   523.04MB 39.07%  github.com/dgraph-io/badger/skl.(*Skiplist).Put
  184.51MB 13.78% 52.85%   184.51MB 13.78%  runtime.stringtoslicebyte
  166.01MB 12.40% 65.25%   689.04MB 51.47%  github.com/dgraph-io/badger/mem.(*Table).Put
     165MB 12.33% 77.58%      165MB 12.33%  runtime.convT2E
  116.92MB  8.73% 86.31%   116.92MB  8.73%  bytes.makeSlice
   62.50MB  4.67% 90.98%    62.50MB  4.67%  main.newValue
   34.50MB  2.58% 93.56%    34.50MB  2.58%  github.com/dgraph-io/badger/table.(*BlockIterator).parseKV
   25.50MB  1.90% 95.46%   100.06MB  7.47%  github.com/dgraph-io/badger/y.(*MergeIterator).Next
   21.06MB  1.57% 97.04%    21.06MB  1.57%  github.com/dgraph-io/badger/table.(*Table).read
   12.50MB  0.93% 97.97%    12.50MB  0.93%  github.com/dgraph-io/badger/table.header.Encode

      128.31 real       329.37 user        17.11 sys
  3355660288  maximum resident set size
           0  average shared memory size
           0  average unshared data size
           0  average unshared stack size
     2203080  page reclaims
         764  page faults
           0  swaps
         275  block input operations
          76  block output operations
           0  messages sent
           0  messages received
           0  signals received
       49173  voluntary context switches
      599922  involuntary context switches
```

## After node pooling

```
1963.13MB of 2026.09MB total (96.89%)
Dropped 29 nodes (cum <= 10.13MB)
Showing top 10 nodes out of 41 (cum >= 185.62MB)
      flat  flat%   sum%        cum   cum%
  658.05MB 32.48% 32.48%   658.05MB 32.48%  github.com/dgraph-io/badger/skl.glob..func1
  297.51MB 14.68% 47.16%   297.51MB 14.68%  runtime.convT2E
  257.51MB 12.71% 59.87%   257.51MB 12.71%  runtime.stringtoslicebyte
  249.01MB 12.29% 72.16%  1007.06MB 49.70%  github.com/dgraph-io/badger/mem.(*Table).Put
  142.43MB  7.03% 79.19%   142.43MB  7.03%  bytes.makeSlice
     100MB  4.94% 84.13%   758.05MB 37.41%  github.com/dgraph-io/badger/skl.newNode
   99.50MB  4.91% 89.04%    99.50MB  4.91%  main.newValue
      75MB  3.70% 92.74%       75MB  3.70%  github.com/dgraph-io/badger/table.(*BlockIterator).parseKV
   44.62MB  2.20% 94.94%    44.62MB  2.20%  github.com/dgraph-io/badger/table.(*Table).read
   39.50MB  1.95% 96.89%   185.62MB  9.16%  github.com/dgraph-io/badger/y.(*MergeIterator).Next

      135.58 real       374.29 user        17.65 sys
  3740614656  maximum resident set size
           0  average shared memory size
           0  average unshared data size
           0  average unshared stack size
     2276566  page reclaims
         770  page faults
           0  swaps
         128  block input operations
          90  block output operations
           0  messages sent
           0  messages received
           0  signals received
       46434  voluntary context switches
      597049  involuntary context switches
```
@ -0,0 +1,131 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package skl

import (
	"sync/atomic"
	"unsafe"

	"github.com/dgraph-io/badger/y"
)

const (
	offsetSize = int(unsafe.Sizeof(uint32(0)))
	ptrAlign   = int(unsafe.Sizeof(uintptr(0))) - 1
)

// Arena should be lock-free.
type Arena struct {
	n   uint32
	buf []byte
}

// newArena returns a new arena.
func newArena(n int64) *Arena {
	// Don't store data at position 0 in order to reserve offset=0 as a kind
	// of nil pointer.
	out := &Arena{
		n:   1,
		buf: make([]byte, n),
	}
	return out
}

func (s *Arena) size() int64 {
	return int64(atomic.LoadUint32(&s.n))
}

func (s *Arena) reset() {
	atomic.StoreUint32(&s.n, 0)
}

// putNode allocates a node in the arena. The node is aligned on a pointer-sized
// boundary. The arena offset of the node is returned.
func (s *Arena) putNode(height int) uint32 {
	// Compute the amount of the tower that will never be used, since the height
	// is less than maxHeight.
	unusedSize := (maxHeight - height) * offsetSize

	// Pad the allocation with enough bytes to ensure pointer alignment.
	l := uint32(MaxNodeSize - unusedSize + ptrAlign)
	n := atomic.AddUint32(&s.n, l)
	y.AssertTruef(int(n) <= len(s.buf),
		"Arena too small, toWrite:%d newTotal:%d limit:%d",
		l, n, len(s.buf))

	// Return the aligned offset.
	m := (n - l + uint32(ptrAlign)) & ^uint32(ptrAlign)
	return m
}
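
// Editor's note (illustrative sketch): the masking above is ordinary
// round-up alignment to a power-of-two boundary; alignUp is a hypothetical
// standalone restatement of the same arithmetic.
func alignUp(offset, align uint32) uint32 { // align must be a power of two
	return (offset + align - 1) &^ (align - 1)
}

// e.g. alignUp(61, 8) == 64, matching (n-l+ptrAlign) &^ ptrAlign above, since
// ptrAlign == align-1 == 7 on 64-bit platforms.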

// Put will *copy* val into arena. To make better use of this, reuse your input
// val buffer. Returns an offset into buf. User is responsible for remembering
// size of val. We could also store this size inside arena but the encoding and
// decoding will incur some overhead.
func (s *Arena) putVal(v y.ValueStruct) uint32 {
	l := uint32(v.EncodedSize())
	n := atomic.AddUint32(&s.n, l)
	y.AssertTruef(int(n) <= len(s.buf),
		"Arena too small, toWrite:%d newTotal:%d limit:%d",
		l, n, len(s.buf))
	m := n - l
	v.Encode(s.buf[m:])
	return m
}

func (s *Arena) putKey(key []byte) uint32 {
	l := uint32(len(key))
	n := atomic.AddUint32(&s.n, l)
	y.AssertTruef(int(n) <= len(s.buf),
		"Arena too small, toWrite:%d newTotal:%d limit:%d",
		l, n, len(s.buf))
	m := n - l
	y.AssertTrue(len(key) == copy(s.buf[m:n], key))
	return m
}

// getNode returns a pointer to the node located at offset. If the offset is
// zero, then the nil node pointer is returned.
func (s *Arena) getNode(offset uint32) *node {
	if offset == 0 {
		return nil
	}

	return (*node)(unsafe.Pointer(&s.buf[offset]))
}

// getKey returns byte slice at offset.
func (s *Arena) getKey(offset uint32, size uint16) []byte {
	return s.buf[offset : offset+uint32(size)]
}

// getVal returns byte slice at offset. The given size should be just the value
// size and should NOT include the meta bytes.
func (s *Arena) getVal(offset uint32, size uint16) (ret y.ValueStruct) {
	ret.Decode(s.buf[offset : offset+uint32(size)])
	return
}

// getNodeOffset returns the offset of node in the arena. If the node pointer is
// nil, then the zero offset is returned.
func (s *Arena) getNodeOffset(nd *node) uint32 {
	if nd == nil {
		return 0
	}

	return uint32(uintptr(unsafe.Pointer(nd)) - uintptr(unsafe.Pointer(&s.buf[0])))
}
@ -0,0 +1,524 @@
/*
|
||||
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
Adapted from RocksDB inline skiplist.
|
||||
|
||||
Key differences:
|
||||
- No optimization for sequential inserts (no "prev").
|
||||
- No custom comparator.
|
||||
- Support overwrites. This requires care when we see the same key when inserting.
|
||||
For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
|
||||
there is no need for values. We don't intend to support versioning. In-place updates of values
|
||||
would be more efficient.
|
||||
- We discard all non-concurrent code.
|
||||
- We do not support Splices. This simplifies the code a lot.
|
||||
- No AllocateNode or other pointer arithmetic.
|
||||
- We combine the findLessThan, findGreaterOrEqual, etc into one function.
|
||||
*/
|
||||
|
||||
package skl
|
||||
|
||||
import (
|
||||
"math"
|
||||
"math/rand"
|
||||
"sync/atomic"
|
||||
"unsafe"
|
||||
|
||||
"github.com/dgraph-io/badger/y"
|
||||
)
|
||||
|
||||
const (
|
||||
maxHeight = 20
|
||||
heightIncrease = math.MaxUint32 / 3
|
||||
)
|
||||
|
||||
// MaxNodeSize is the memory footprint of a node of maximum height.
|
||||
const MaxNodeSize = int(unsafe.Sizeof(node{}))
|
||||
|
||||
type node struct {
|
||||
// Multiple parts of the value are encoded as a single uint64 so that it
|
||||
// can be atomically loaded and stored:
|
||||
// value offset: uint32 (bits 0-31)
|
||||
// value size : uint16 (bits 32-47)
|
||||
// 12 bytes are allocated to ensure 8 byte alignment also on 32bit systems.
|
||||
value [12]byte
|
||||
|
||||
// A byte slice is 24 bytes. We are trying to save space here.
|
||||
keyOffset uint32 // Immutable. No need to lock to access key.
|
||||
keySize uint16 // Immutable. No need to lock to access key.
|
||||
|
||||
// Height of the tower.
|
||||
height uint16
|
||||
|
||||
// Most nodes do not need to use the full height of the tower, since the
|
||||
// probability of each successive level decreases exponentially. Because
|
||||
// these elements are never accessed, they do not need to be allocated.
|
||||
// Therefore, when a node is allocated in the arena, its memory footprint
|
||||
// is deliberately truncated to not include unneeded tower elements.
|
||||
//
|
||||
// All accesses to elements should use CAS operations, with no need to lock.
|
||||
tower [maxHeight]uint32
|
||||
}
|
||||
|
||||
// Skiplist maps keys to values (in memory)
|
||||
type Skiplist struct {
|
||||
height int32 // Current height. 1 <= height <= kMaxHeight. CAS.
|
||||
head *node
|
||||
ref int32
|
||||
arena *Arena
|
||||
}
|
||||
|
||||
// IncrRef increases the refcount
|
||||
func (s *Skiplist) IncrRef() {
|
||||
atomic.AddInt32(&s.ref, 1)
|
||||
}
|
||||
|
||||
// DecrRef decrements the refcount, deallocating the Skiplist when done using it
|
||||
func (s *Skiplist) DecrRef() {
|
||||
newRef := atomic.AddInt32(&s.ref, -1)
|
||||
if newRef > 0 {
|
||||
return
|
||||
}
|
||||
|
||||
s.arena.reset()
|
||||
// Indicate we are closed. Good for testing. Also, lets GC reclaim memory. Race condition
|
||||
// here would suggest we are accessing skiplist when we are supposed to have no reference!
|
||||
s.arena = nil
|
||||
}
|
||||
|
||||
func (s *Skiplist) valid() bool { return s.arena != nil }
|
||||
|
||||
func newNode(arena *Arena, key []byte, v y.ValueStruct, height int) *node {
|
||||
// The base level is already allocated in the node struct.
|
||||
offset := arena.putNode(height)
|
||||
node := arena.getNode(offset)
|
||||
node.keyOffset = arena.putKey(key)
|
||||
node.keySize = uint16(len(key))
|
||||
node.height = uint16(height)
|
||||
*node.value64BitAlignedPtr() = encodeValue(arena.putVal(v), v.EncodedSize())
|
||||
return node
|
||||
}
|
||||
|
||||
func encodeValue(valOffset uint32, valSize uint16) uint64 {
|
||||
return uint64(valSize)<<32 | uint64(valOffset)
|
||||
}
|
||||
|
||||
func decodeValue(value uint64) (valOffset uint32, valSize uint16) {
|
||||
valOffset = uint32(value)
|
||||
valSize = uint16(value >> 32)
|
||||
return
|
||||
}
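// Round-trip sketch (editor's illustration, not part of the original commit):
// encodeValue and decodeValue are exact inverses, packing the offset into the
// low 32 bits and the size into bits 32-47 of one uint64, matching the layout
// documented on the node struct above.
//
//	v := encodeValue(4096, 42) // value offset 4096, size 42
//	off, sz := decodeValue(v)  // off == 4096, sz == 42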
|
||||
|
||||
// NewSkiplist makes a new empty skiplist, with a given arena size
|
||||
func NewSkiplist(arenaSize int64) *Skiplist {
|
||||
arena := newArena(arenaSize)
|
||||
head := newNode(arena, nil, y.ValueStruct{}, maxHeight)
|
||||
return &Skiplist{
|
||||
height: 1,
|
||||
head: head,
|
||||
arena: arena,
|
||||
ref: 1,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *node) value64BitAlignedPtr() *uint64 {
|
||||
if uintptr(unsafe.Pointer(&s.value))%8 == 0 {
|
||||
return (*uint64)(unsafe.Pointer(&s.value))
|
||||
}
|
||||
return (*uint64)(unsafe.Pointer(&s.value[4]))
|
||||
}
|
||||
|
||||
func (s *node) getValueOffset() (uint32, uint16) {
|
||||
value := atomic.LoadUint64(s.value64BitAlignedPtr())
|
||||
return decodeValue(value)
|
||||
}
|
||||
|
||||
func (s *node) key(arena *Arena) []byte {
|
||||
return arena.getKey(s.keyOffset, s.keySize)
|
||||
}
|
||||
|
||||
func (s *node) setValue(arena *Arena, v y.ValueStruct) {
|
||||
valOffset := arena.putVal(v)
|
||||
value := encodeValue(valOffset, v.EncodedSize())
|
||||
atomic.StoreUint64(s.value64BitAlignedPtr(), value)
|
||||
}
|
||||
|
||||
func (s *node) getNextOffset(h int) uint32 {
|
||||
return atomic.LoadUint32(&s.tower[h])
|
||||
}
|
||||
|
||||
func (s *node) casNextOffset(h int, old, val uint32) bool {
|
||||
return atomic.CompareAndSwapUint32(&s.tower[h], old, val)
|
||||
}
|
||||
|
||||
// Returns true if key is strictly > n.key.
|
||||
// If n is nil, this is an "end" marker and we return false.
|
||||
//func (s *Skiplist) keyIsAfterNode(key []byte, n *node) bool {
|
||||
// y.AssertTrue(n != s.head)
|
||||
// return n != nil && y.CompareKeys(key, n.key) > 0
|
||||
//}
|
||||
|
||||
func randomHeight() int {
|
||||
h := 1
|
||||
for h < maxHeight && rand.Uint32() <= heightIncrease {
|
||||
h++
|
||||
}
|
||||
return h
|
||||
}
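// Editor's note (not in the original commit): randomHeight draws from a
// geometric distribution with p = 1/3, since heightIncrease = MaxUint32/3
// makes each additional level a roughly one-in-three event.
//
//	// E[h] = 1/(1 - 1/3) = 1.5, so towers are short on average.
//	// P(h == maxHeight) ~= (1/3)^19, i.e. vanishingly rare.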
|
||||
|
||||
func (s *Skiplist) getNext(nd *node, height int) *node {
|
||||
return s.arena.getNode(nd.getNextOffset(height))
|
||||
}
|
||||
|
||||
// findNear finds the node near to key.
|
||||
// If less=true, it finds rightmost node such that node.key < key (if allowEqual=false) or
|
||||
// node.key <= key (if allowEqual=true).
|
||||
// If less=false, it finds leftmost node such that node.key > key (if allowEqual=false) or
|
||||
// node.key >= key (if allowEqual=true).
|
||||
// Returns the node found. The bool returned is true if the node has key equal to given key.
|
||||
func (s *Skiplist) findNear(key []byte, less bool, allowEqual bool) (*node, bool) {
|
||||
x := s.head
|
||||
level := int(s.getHeight() - 1)
|
||||
for {
|
||||
// Assume x.key < key.
|
||||
next := s.getNext(x, level)
|
||||
if next == nil {
|
||||
// x.key < key < END OF LIST
|
||||
if level > 0 {
|
||||
// Can descend further to iterate closer to the end.
|
||||
level--
|
||||
continue
|
||||
}
|
||||
// Level=0. Cannot descend further. Let's return something that makes sense.
|
||||
if !less {
|
||||
return nil, false
|
||||
}
|
||||
// Try to return x. Make sure it is not a head node.
|
||||
if x == s.head {
|
||||
return nil, false
|
||||
}
|
||||
return x, false
|
||||
}
|
||||
|
||||
nextKey := next.key(s.arena)
|
||||
cmp := y.CompareKeys(key, nextKey)
|
||||
if cmp > 0 {
|
||||
// x.key < next.key < key. We can continue to move right.
|
||||
x = next
|
||||
continue
|
||||
}
|
||||
if cmp == 0 {
|
||||
// x.key < key == next.key.
|
||||
if allowEqual {
|
||||
return next, true
|
||||
}
|
||||
if !less {
|
||||
// We want >, so go to base level to grab the next bigger node.
|
||||
return s.getNext(next, 0), false
|
||||
}
|
||||
// We want <. If not base level, we should go closer in the next level.
|
||||
if level > 0 {
|
||||
level--
|
||||
continue
|
||||
}
|
||||
// On base level. Return x.
|
||||
if x == s.head {
|
||||
return nil, false
|
||||
}
|
||||
return x, false
|
||||
}
|
||||
// cmp < 0. In other words, x.key < key < next.key.
|
||||
if level > 0 {
|
||||
level--
|
||||
continue
|
||||
}
|
||||
// At base level. Need to return something.
|
||||
if !less {
|
||||
return next, false
|
||||
}
|
||||
// Try to return x. Make sure it is not a head node.
|
||||
if x == s.head {
|
||||
return nil, false
|
||||
}
|
||||
return x, false
|
||||
}
|
||||
}
|
||||
|
||||
// findSpliceForLevel returns (outBefore, outAfter) with outBefore.key <= key <= outAfter.key.
|
||||
// The input "before" tells us where to start looking.
|
||||
// If we found a node with the same key, then we return outBefore = outAfter.
|
||||
// Otherwise, outBefore.key < key < outAfter.key.
|
||||
func (s *Skiplist) findSpliceForLevel(key []byte, before *node, level int) (*node, *node) {
|
||||
for {
|
||||
// Assume before.key < key.
|
||||
next := s.getNext(before, level)
|
||||
if next == nil {
|
||||
return before, next
|
||||
}
|
||||
nextKey := next.key(s.arena)
|
||||
cmp := y.CompareKeys(key, nextKey)
|
||||
if cmp == 0 {
|
||||
// Equality case.
|
||||
return next, next
|
||||
}
|
||||
if cmp < 0 {
|
||||
// before.key < key < next.key. We are done for this level.
|
||||
return before, next
|
||||
}
|
||||
before = next // Keep moving right on this level.
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Skiplist) getHeight() int32 {
|
||||
return atomic.LoadInt32(&s.height)
|
||||
}
|
||||
|
||||
// Put inserts the key-value pair.
|
||||
func (s *Skiplist) Put(key []byte, v y.ValueStruct) {
|
||||
// Since we allow overwrite, we may not need to create a new node. We might not even need to
|
||||
// increase the height. Let's defer these actions.
|
||||
|
||||
listHeight := s.getHeight()
|
||||
var prev [maxHeight + 1]*node
|
||||
var next [maxHeight + 1]*node
|
||||
prev[listHeight] = s.head
|
||||
next[listHeight] = nil
|
||||
for i := int(listHeight) - 1; i >= 0; i-- {
|
||||
// Use higher level to speed up for current level.
|
||||
prev[i], next[i] = s.findSpliceForLevel(key, prev[i+1], i)
|
||||
if prev[i] == next[i] {
|
||||
prev[i].setValue(s.arena, v)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// We do need to create a new node.
|
||||
height := randomHeight()
|
||||
x := newNode(s.arena, key, v, height)
|
||||
|
||||
// Try to increase s.height via CAS.
|
||||
listHeight = s.getHeight()
|
||||
for height > int(listHeight) {
|
||||
if atomic.CompareAndSwapInt32(&s.height, listHeight, int32(height)) {
|
||||
// Successfully increased skiplist.height.
|
||||
break
|
||||
}
|
||||
listHeight = s.getHeight()
|
||||
}
|
||||
|
||||
// We always insert from the base level and up. Once a node is added at the base level, another
|
||||
// insert cannot create a node for the same key at a level above, because it would have discovered the node in the base level.
|
||||
for i := 0; i < height; i++ {
|
||||
for {
|
||||
if prev[i] == nil {
|
||||
y.AssertTrue(i > 1) // This cannot happen in base level.
|
||||
// We haven't computed prev, next for this level because height exceeds old listHeight.
|
||||
// For these levels, we expect the lists to be sparse, so we can just search from head.
|
||||
prev[i], next[i] = s.findSpliceForLevel(key, s.head, i)
|
||||
// Someone else added the exact same key before we could. That can only happen on
|
||||
// the base level, and we know we are not on the base level.
|
||||
y.AssertTrue(prev[i] != next[i])
|
||||
}
|
||||
nextOffset := s.arena.getNodeOffset(next[i])
|
||||
x.tower[i] = nextOffset
|
||||
if prev[i].casNextOffset(i, nextOffset, s.arena.getNodeOffset(x)) {
|
||||
// Managed to insert x between prev[i] and next[i]. Go to the next level.
|
||||
break
|
||||
}
|
||||
// CAS failed. We need to recompute prev and next.
|
||||
// It is unlikely to be helpful to try to use a different level as we redo the search,
|
||||
// because it is unlikely that lots of nodes are inserted between prev[i] and next[i].
|
||||
prev[i], next[i] = s.findSpliceForLevel(key, prev[i], i)
|
||||
if prev[i] == next[i] {
|
||||
y.AssertTruef(i == 0, "Equality can happen only on base level: %d", i)
|
||||
prev[i].setValue(s.arena, v)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Empty returns whether the Skiplist is empty.
|
||||
func (s *Skiplist) Empty() bool {
|
||||
return s.findLast() == nil
|
||||
}
|
||||
|
||||
// findLast returns the last element. If head (empty list), we return nil. All the find functions
|
||||
// will NEVER return the head nodes.
|
||||
func (s *Skiplist) findLast() *node {
|
||||
n := s.head
|
||||
level := int(s.getHeight()) - 1
|
||||
for {
|
||||
next := s.getNext(n, level)
|
||||
if next != nil {
|
||||
n = next
|
||||
continue
|
||||
}
|
||||
if level == 0 {
|
||||
if n == s.head {
|
||||
return nil
|
||||
}
|
||||
return n
|
||||
}
|
||||
level--
|
||||
}
|
||||
}
|
||||
|
||||
// Get gets the value associated with the key. It returns a valid value if it finds an equal or earlier
|
||||
// version of the same key.
|
||||
func (s *Skiplist) Get(key []byte) y.ValueStruct {
|
||||
n, _ := s.findNear(key, false, true) // findGreaterOrEqual.
|
||||
if n == nil {
|
||||
return y.ValueStruct{}
|
||||
}
|
||||
|
||||
nextKey := s.arena.getKey(n.keyOffset, n.keySize)
|
||||
if !y.SameKey(key, nextKey) {
|
||||
return y.ValueStruct{}
|
||||
}
|
||||
|
||||
valOffset, valSize := n.getValueOffset()
|
||||
vs := s.arena.getVal(valOffset, valSize)
|
||||
vs.Version = y.ParseTs(nextKey)
|
||||
return vs
|
||||
}
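// Usage sketch (editor's illustration; relies on the y package helpers shown
// elsewhere in this commit; error handling elided):
//
//	s := NewSkiplist(1 << 20) // 1 MB arena
//	defer s.DecrRef()
//	key := y.KeyWithTs([]byte("foo"), 1) // keys carry a version suffix
//	s.Put(key, y.ValueStruct{Value: []byte("bar")})
//	vs := s.Get(key) // vs.Value == []byte("bar")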
|
||||
|
||||
// NewIterator returns a skiplist iterator. You have to Close() the iterator.
|
||||
func (s *Skiplist) NewIterator() *Iterator {
|
||||
s.IncrRef()
|
||||
return &Iterator{list: s}
|
||||
}
|
||||
|
||||
// MemSize returns the size of the Skiplist in terms of how much memory is used within its internal
|
||||
// arena.
|
||||
func (s *Skiplist) MemSize() int64 { return s.arena.size() }
|
||||
|
||||
// Iterator is an iterator over a skiplist object. For new objects, you just
|
||||
// need to initialize Iterator.list.
|
||||
type Iterator struct {
|
||||
list *Skiplist
|
||||
n *node
|
||||
}
|
||||
|
||||
// Close frees the resources held by the iterator
|
||||
func (s *Iterator) Close() error {
|
||||
s.list.DecrRef()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Valid returns true iff the iterator is positioned at a valid node.
|
||||
func (s *Iterator) Valid() bool { return s.n != nil }
|
||||
|
||||
// Key returns the key at the current position.
|
||||
func (s *Iterator) Key() []byte {
|
||||
return s.list.arena.getKey(s.n.keyOffset, s.n.keySize)
|
||||
}
|
||||
|
||||
// Value returns value.
|
||||
func (s *Iterator) Value() y.ValueStruct {
|
||||
valOffset, valSize := s.n.getValueOffset()
|
||||
return s.list.arena.getVal(valOffset, valSize)
|
||||
}
|
||||
|
||||
// Next advances to the next position.
|
||||
func (s *Iterator) Next() {
|
||||
y.AssertTrue(s.Valid())
|
||||
s.n = s.list.getNext(s.n, 0)
|
||||
}
|
||||
|
||||
// Prev moves to the previous position.
|
||||
func (s *Iterator) Prev() {
|
||||
y.AssertTrue(s.Valid())
|
||||
s.n, _ = s.list.findNear(s.Key(), true, false) // find <. No equality allowed.
|
||||
}
|
||||
|
||||
// Seek advances to the first entry with a key >= target.
|
||||
func (s *Iterator) Seek(target []byte) {
|
||||
s.n, _ = s.list.findNear(target, false, true) // find >=.
|
||||
}
|
||||
|
||||
// SeekForPrev finds an entry with key <= target.
|
||||
func (s *Iterator) SeekForPrev(target []byte) {
|
||||
s.n, _ = s.list.findNear(target, true, true) // find <=.
|
||||
}
|
||||
|
||||
// SeekToFirst seeks position at the first entry in list.
|
||||
// Final state of iterator is Valid() iff list is not empty.
|
||||
func (s *Iterator) SeekToFirst() {
|
||||
s.n = s.list.getNext(s.list.head, 0)
|
||||
}
|
||||
|
||||
// SeekToLast seeks position at the last entry in list.
|
||||
// Final state of iterator is Valid() iff list is not empty.
|
||||
func (s *Iterator) SeekToLast() {
|
||||
s.n = s.list.findLast()
|
||||
}
|
||||
|
||||
// UniIterator is a unidirectional memtable iterator. It is a thin wrapper around
|
||||
// Iterator. We like to keep Iterator as before, because it is more powerful and
|
||||
// we might support bidirectional iterators in the future.
|
||||
type UniIterator struct {
|
||||
iter *Iterator
|
||||
reversed bool
|
||||
}
|
||||
|
||||
// NewUniIterator returns a UniIterator.
|
||||
func (s *Skiplist) NewUniIterator(reversed bool) *UniIterator {
|
||||
return &UniIterator{
|
||||
iter: s.NewIterator(),
|
||||
reversed: reversed,
|
||||
}
|
||||
}
|
||||
|
||||
// Next implements y.Interface
|
||||
func (s *UniIterator) Next() {
|
||||
if !s.reversed {
|
||||
s.iter.Next()
|
||||
} else {
|
||||
s.iter.Prev()
|
||||
}
|
||||
}
|
||||
|
||||
// Rewind implements y.Interface
|
||||
func (s *UniIterator) Rewind() {
|
||||
if !s.reversed {
|
||||
s.iter.SeekToFirst()
|
||||
} else {
|
||||
s.iter.SeekToLast()
|
||||
}
|
||||
}
|
||||
|
||||
// Seek implements y.Interface
|
||||
func (s *UniIterator) Seek(key []byte) {
|
||||
if !s.reversed {
|
||||
s.iter.Seek(key)
|
||||
} else {
|
||||
s.iter.SeekForPrev(key)
|
||||
}
|
||||
}
|
||||
|
||||
// Key implements y.Interface
|
||||
func (s *UniIterator) Key() []byte { return s.iter.Key() }
|
||||
|
||||
// Value implements y.Interface
|
||||
func (s *UniIterator) Value() y.ValueStruct { return s.iter.Value() }
|
||||
|
||||
// Valid implements y.Interface
|
||||
func (s *UniIterator) Valid() bool { return s.iter.Valid() }
|
||||
|
||||
// Close implements y.Interface (and frees up the iter's resources)
|
||||
func (s *UniIterator) Close() error { return s.iter.Close() }
|
|
@ -0,0 +1,132 @@
|
|||
package badger
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"hash/crc32"
|
||||
|
||||
"github.com/dgraph-io/badger/y"
|
||||
)
|
||||
|
||||
type valuePointer struct {
|
||||
Fid uint32
|
||||
Len uint32
|
||||
Offset uint32
|
||||
}
|
||||
|
||||
func (p valuePointer) Less(o valuePointer) bool {
|
||||
if p.Fid != o.Fid {
|
||||
return p.Fid < o.Fid
|
||||
}
|
||||
if p.Offset != o.Offset {
|
||||
return p.Offset < o.Offset
|
||||
}
|
||||
return p.Len < o.Len
|
||||
}
|
||||
|
||||
func (p valuePointer) IsZero() bool {
|
||||
return p.Fid == 0 && p.Offset == 0 && p.Len == 0
|
||||
}
|
||||
|
||||
const vptrSize = 12
|
||||
|
||||
// Encode encodes Pointer into byte buffer.
|
||||
func (p valuePointer) Encode(b []byte) []byte {
|
||||
binary.BigEndian.PutUint32(b[:4], p.Fid)
|
||||
binary.BigEndian.PutUint32(b[4:8], p.Len)
|
||||
binary.BigEndian.PutUint32(b[8:12], p.Offset)
|
||||
return b[:vptrSize]
|
||||
}
|
||||
|
||||
func (p *valuePointer) Decode(b []byte) {
|
||||
p.Fid = binary.BigEndian.Uint32(b[:4])
|
||||
p.Len = binary.BigEndian.Uint32(b[4:8])
|
||||
p.Offset = binary.BigEndian.Uint32(b[8:12])
|
||||
}
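// Round-trip sketch (editor's illustration): Encode lays out Fid, Len and
// Offset big-endian in 12 bytes; Decode reverses it.
//
//	var buf [vptrSize]byte
//	p := valuePointer{Fid: 1, Len: 100, Offset: 4096}
//	var q valuePointer
//	q.Decode(p.Encode(buf[:])) // q == p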
|
||||
|
||||
// header is used in value log as a header before Entry.
|
||||
type header struct {
|
||||
klen uint32
|
||||
vlen uint32
|
||||
expiresAt uint64
|
||||
meta byte
|
||||
userMeta byte
|
||||
}
|
||||
|
||||
const (
|
||||
headerBufSize = 18
|
||||
)
|
||||
|
||||
func (h header) Encode(out []byte) {
|
||||
y.AssertTrue(len(out) >= headerBufSize)
|
||||
binary.BigEndian.PutUint32(out[0:4], h.klen)
|
||||
binary.BigEndian.PutUint32(out[4:8], h.vlen)
|
||||
binary.BigEndian.PutUint64(out[8:16], h.expiresAt)
|
||||
out[16] = h.meta
|
||||
out[17] = h.userMeta
|
||||
}
|
||||
|
||||
// Decodes h from buf.
|
||||
func (h *header) Decode(buf []byte) {
|
||||
h.klen = binary.BigEndian.Uint32(buf[0:4])
|
||||
h.vlen = binary.BigEndian.Uint32(buf[4:8])
|
||||
h.expiresAt = binary.BigEndian.Uint64(buf[8:16])
|
||||
h.meta = buf[16]
|
||||
h.userMeta = buf[17]
|
||||
}
|
||||
|
||||
// Entry provides Key, Value, UserMeta and ExpiresAt. This struct can be used by the user to set data.
|
||||
type Entry struct {
|
||||
Key []byte
|
||||
Value []byte
|
||||
UserMeta byte
|
||||
ExpiresAt uint64 // time.Unix
|
||||
meta byte
|
||||
|
||||
// Fields maintained internally.
|
||||
offset uint32
|
||||
}
|
||||
|
||||
func (e *Entry) estimateSize(threshold int) int {
|
||||
if len(e.Value) < threshold {
|
||||
return len(e.Key) + len(e.Value) + 2 // Meta, UserMeta
|
||||
}
|
||||
return len(e.Key) + 12 + 2 // 12 for ValuePointer, 2 for metas.
|
||||
}
|
||||
|
||||
// Encodes e to buf. Returns number of bytes written.
|
||||
func encodeEntry(e *Entry, buf *bytes.Buffer) (int, error) {
|
||||
h := header{
|
||||
klen: uint32(len(e.Key)),
|
||||
vlen: uint32(len(e.Value)),
|
||||
expiresAt: e.ExpiresAt,
|
||||
meta: e.meta,
|
||||
userMeta: e.UserMeta,
|
||||
}
|
||||
|
||||
var headerEnc [headerBufSize]byte
|
||||
h.Encode(headerEnc[:])
|
||||
|
||||
hash := crc32.New(y.CastagnoliCrcTable)
|
||||
|
||||
buf.Write(headerEnc[:])
|
||||
hash.Write(headerEnc[:])
|
||||
|
||||
buf.Write(e.Key)
|
||||
hash.Write(e.Key)
|
||||
|
||||
buf.Write(e.Value)
|
||||
hash.Write(e.Value)
|
||||
|
||||
var crcBuf [4]byte
|
||||
binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32())
|
||||
buf.Write(crcBuf[:])
|
||||
|
||||
return len(headerEnc) + len(e.Key) + len(e.Value) + len(crcBuf), nil
|
||||
}
|
||||
|
||||
func (e Entry) print(prefix string) {
|
||||
fmt.Printf("%s Key: %s Meta: %d UserMeta: %d Offset: %d len(val)=%d",
|
||||
prefix, e.Key, e.meta, e.UserMeta, e.offset, len(e.Value))
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
# BenchmarkRead
|
||||
|
||||
```
|
||||
$ go test -bench Read$ -count 3
|
||||
|
||||
Size of table: 105843444
|
||||
BenchmarkRead-8 3 343846914 ns/op
|
||||
BenchmarkRead-8 3 351790907 ns/op
|
||||
BenchmarkRead-8 3 351762823 ns/op
|
||||
```
|
||||
|
||||
Size of table is 105,843,444 bytes, which is ~101M.
|
||||
|
||||
The rate is ~287M/s, which matches our read speed. This is using mmap.
|
||||
|
||||
To read a 64M table, this would take ~0.22s, which is negligible.
|
||||
|
||||
```
|
||||
$ go test -bench BenchmarkReadAndBuild -count 3
|
||||
|
||||
BenchmarkReadAndBuild-8 1 2341034225 ns/op
|
||||
BenchmarkReadAndBuild-8 1 2346349671 ns/op
|
||||
BenchmarkReadAndBuild-8 1 2364064576 ns/op
|
||||
```
|
||||
|
||||
The rate is ~43M/s. To build a ~64M table, this would take ~1.5s. Note that this
|
||||
does NOT include the flushing of the table to disk. All we are doing above is
|
||||
to read one table (mmapped) and write one table in memory.
|
||||
|
||||
The table building takes 1.5-0.22 ~ 1.3s.
|
||||
|
||||
If we are writing out up to 10 tables, this would take 1.5*10 ~ 15s, and ~13s
|
||||
is spent building the tables.
|
||||
|
||||
When running populate, building one table in memory tends to take ~1.5s to ~2.5s
|
||||
on my system. Where does this overhead come from? Let's investigate the merging.
|
||||
|
||||
Below, we merge 5 tables. The total size remains unchanged at ~101M.
|
||||
|
||||
```
|
||||
$ go test -bench ReadMerged -count 3
|
||||
BenchmarkReadMerged-8 1 1321190264 ns/op
|
||||
BenchmarkReadMerged-8 1 1296958737 ns/op
|
||||
BenchmarkReadMerged-8 1 1314381178 ns/op
|
||||
```
|
||||
|
||||
The rate is ~76M/s. To build a 64M table, this would take ~0.84s. The writing
|
||||
takes ~1.3s as we saw above. So in total, we expect around 0.84+1.3 ~ 2.1s.
|
||||
This roughly matches what we observe when running populate. There might be
|
||||
some additional overhead from the concurrent writes going on while flushing the
|
||||
table to disk. Also, the tables tend to be slightly bigger than 64M.
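
For reference, the throughput numbers above follow mechanically from the table size and the ns/op figures. A small throwaway program (editor's sketch, not part of the benchmark suite) reproduces the arithmetic:

```
package main

import "fmt"

func main() {
	const tableMiB = 105843444.0 / (1 << 20) // ~101 MiB, from the output above

	// ns/op figures copied from the benchmark runs above.
	benches := map[string]float64{
		"Read":       351790907,
		"ReadMerged": 1314381178,
	}
	for name, nsPerOp := range benches {
		fmt.Printf("Benchmark%s: ~%.0f MiB/s\n", name, tableMiB/(nsPerOp/1e9))
	}
}
```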
|
|
@ -0,0 +1,235 @@
|
|||
/*
|
||||
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"math"
|
||||
|
||||
"github.com/AndreasBriese/bbloom"
|
||||
"github.com/dgraph-io/badger/y"
|
||||
)
|
||||
|
||||
var (
|
||||
restartInterval = 100 // Might want to change this to be based on total size instead of numKeys.
|
||||
)
|
||||
|
||||
func newBuffer(sz int) *bytes.Buffer {
|
||||
b := new(bytes.Buffer)
|
||||
b.Grow(sz)
|
||||
return b
|
||||
}
|
||||
|
||||
type header struct {
|
||||
plen uint16 // Overlap with base key.
|
||||
klen uint16 // Length of the diff.
|
||||
vlen uint16 // Length of value.
|
||||
prev uint32 // Offset for the previous key-value pair. The offset is relative to block base offset.
|
||||
}
|
||||
|
||||
// Encode encodes the header.
|
||||
func (h header) Encode(b []byte) {
|
||||
binary.BigEndian.PutUint16(b[0:2], h.plen)
|
||||
binary.BigEndian.PutUint16(b[2:4], h.klen)
|
||||
binary.BigEndian.PutUint16(b[4:6], h.vlen)
|
||||
binary.BigEndian.PutUint32(b[6:10], h.prev)
|
||||
}
|
||||
|
||||
// Decode decodes the header.
|
||||
func (h *header) Decode(buf []byte) int {
|
||||
h.plen = binary.BigEndian.Uint16(buf[0:2])
|
||||
h.klen = binary.BigEndian.Uint16(buf[2:4])
|
||||
h.vlen = binary.BigEndian.Uint16(buf[4:6])
|
||||
h.prev = binary.BigEndian.Uint32(buf[6:10])
|
||||
return h.Size()
|
||||
}
|
||||
|
||||
// Size returns size of the header. Currently it's just a constant.
|
||||
func (h header) Size() int { return 10 }
|
||||
|
||||
// Builder is used in building a table.
|
||||
type Builder struct {
|
||||
counter int // Number of keys written for the current block.
|
||||
|
||||
// Typically tens or hundreds of megabytes. This is for one single file.
|
||||
buf *bytes.Buffer
|
||||
|
||||
baseKey []byte // Base key for the current block.
|
||||
baseOffset uint32 // Offset for the current block.
|
||||
|
||||
restarts []uint32 // Base offsets of every block.
|
||||
|
||||
// Tracks offset for the previous key-value pair. Offset is relative to block base offset.
|
||||
prevOffset uint32
|
||||
|
||||
keyBuf *bytes.Buffer
|
||||
keyCount int
|
||||
}
|
||||
|
||||
// NewTableBuilder makes a new TableBuilder.
|
||||
func NewTableBuilder() *Builder {
|
||||
return &Builder{
|
||||
keyBuf: newBuffer(1 << 20),
|
||||
buf: newBuffer(1 << 20),
|
||||
prevOffset: math.MaxUint32, // Used for the first element!
|
||||
}
|
||||
}
|
||||
|
||||
// Close closes the TableBuilder.
|
||||
func (b *Builder) Close() {}
|
||||
|
||||
// Empty returns whether it's empty.
|
||||
func (b *Builder) Empty() bool { return b.buf.Len() == 0 }
|
||||
|
||||
// keyDiff returns a suffix of newKey that is different from b.baseKey.
|
||||
func (b Builder) keyDiff(newKey []byte) []byte {
|
||||
var i int
|
||||
for i = 0; i < len(newKey) && i < len(b.baseKey); i++ {
|
||||
if newKey[i] != b.baseKey[i] {
|
||||
break
|
||||
}
|
||||
}
|
||||
return newKey[i:]
|
||||
}
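// Worked example (editor's illustration): with baseKey "app-key-0001" already
// set for the block, adding "app-key-0002" yields diffKey "2", so the header
// records plen = 11 (shared prefix) and klen = 1, and only the single byte "2"
// is written. Readers rebuild the full key from baseKey plus the diff.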
|
||||
|
||||
func (b *Builder) addHelper(key []byte, v y.ValueStruct) {
|
||||
// Add key to bloom filter.
|
||||
if len(key) > 0 {
|
||||
var klen [2]byte
|
||||
keyNoTs := y.ParseKey(key)
|
||||
binary.BigEndian.PutUint16(klen[:], uint16(len(keyNoTs)))
|
||||
b.keyBuf.Write(klen[:])
|
||||
b.keyBuf.Write(keyNoTs)
|
||||
b.keyCount++
|
||||
}
|
||||
|
||||
// diffKey stores the difference of key with baseKey.
|
||||
var diffKey []byte
|
||||
if len(b.baseKey) == 0 {
|
||||
// Make a copy. Builder should not keep references. Otherwise, caller has to be very careful
|
||||
// and will have to make copies of keys every time they add to builder, which is even worse.
|
||||
b.baseKey = append(b.baseKey[:0], key...)
|
||||
diffKey = key
|
||||
} else {
|
||||
diffKey = b.keyDiff(key)
|
||||
}
|
||||
|
||||
h := header{
|
||||
plen: uint16(len(key) - len(diffKey)),
|
||||
klen: uint16(len(diffKey)),
|
||||
vlen: uint16(v.EncodedSize()),
|
||||
prev: b.prevOffset, // prevOffset is the location of the last key-value added.
|
||||
}
|
||||
b.prevOffset = uint32(b.buf.Len()) - b.baseOffset // Remember current offset for the next Add call.
|
||||
|
||||
// Layout: header, diffKey, value.
|
||||
var hbuf [10]byte
|
||||
h.Encode(hbuf[:])
|
||||
b.buf.Write(hbuf[:])
|
||||
b.buf.Write(diffKey) // We only need to store the key difference.
|
||||
|
||||
v.EncodeTo(b.buf)
|
||||
b.counter++ // Increment number of keys added for this current block.
|
||||
}
|
||||
|
||||
func (b *Builder) finishBlock() {
|
||||
// When we are at the end of the block and Valid=false, and the user wants to do a Prev,
|
||||
// we need a dummy header to tell us the offset of the previous key-value pair.
|
||||
b.addHelper([]byte{}, y.ValueStruct{})
|
||||
}
|
||||
|
||||
// Add adds a key-value pair to the block.
|
||||
// It finishes the current block and starts a new one once b.counter reaches restartInterval.
|
||||
func (b *Builder) Add(key []byte, value y.ValueStruct) error {
|
||||
if b.counter >= restartInterval {
|
||||
b.finishBlock()
|
||||
// Start a new block. Initialize the block.
|
||||
b.restarts = append(b.restarts, uint32(b.buf.Len()))
|
||||
b.counter = 0
|
||||
b.baseKey = []byte{}
|
||||
b.baseOffset = uint32(b.buf.Len())
|
||||
b.prevOffset = math.MaxUint32 // First key-value pair of block has header.prev=MaxUint32.
|
||||
}
|
||||
b.addHelper(key, value)
|
||||
return nil // Currently, there is no meaningful error.
|
||||
}
|
||||
|
||||
// TODO: vvv this was the comment on ReachedCapacity.
|
||||
// FinalSize returns the *rough* final size of the array, counting the header which is not yet written.
|
||||
// TODO: Look into why there is a discrepancy. I suspect it is because of Write(empty, empty)
|
||||
// at the end. The diff can vary.
|
||||
|
||||
// ReachedCapacity returns true if the builder's rough estimate of the final table size exceeds cap.
|
||||
func (b *Builder) ReachedCapacity(cap int64) bool {
|
||||
estimateSz := b.buf.Len() + 8 /* empty header */ + 4*len(b.restarts) + 8 // 8 = end of buf offset + len(restarts).
|
||||
return int64(estimateSz) > cap
|
||||
}
|
||||
|
||||
// blockIndex generates the block index for the table.
|
||||
// It is mainly a list of all the block base offsets.
|
||||
func (b *Builder) blockIndex() []byte {
|
||||
// Store the end offset, so we know the length of the final block.
|
||||
b.restarts = append(b.restarts, uint32(b.buf.Len()))
|
||||
|
||||
// Add 4 because we want to write out number of restarts at the end.
|
||||
sz := 4*len(b.restarts) + 4
|
||||
out := make([]byte, sz)
|
||||
buf := out
|
||||
for _, r := range b.restarts {
|
||||
binary.BigEndian.PutUint32(buf[:4], r)
|
||||
buf = buf[4:]
|
||||
}
|
||||
binary.BigEndian.PutUint32(buf[:4], uint32(len(b.restarts)))
|
||||
return out
|
||||
}
|
||||
|
||||
// Finish finishes the table by appending the index.
|
||||
func (b *Builder) Finish() []byte {
|
||||
bf := bbloom.New(float64(b.keyCount), 0.01)
|
||||
var klen [2]byte
|
||||
key := make([]byte, 1024)
|
||||
for {
|
||||
if _, err := b.keyBuf.Read(klen[:]); err == io.EOF {
|
||||
break
|
||||
} else if err != nil {
|
||||
y.Check(err)
|
||||
}
|
||||
kl := int(binary.BigEndian.Uint16(klen[:]))
|
||||
if cap(key) < kl {
|
||||
key = make([]byte, 2*int(kl)) // Convert to int first; doubling a uint16 could overflow.
|
||||
}
|
||||
key = key[:kl]
|
||||
y.Check2(b.keyBuf.Read(key))
|
||||
bf.Add(key)
|
||||
}
|
||||
|
||||
b.finishBlock() // This will never start a new block.
|
||||
index := b.blockIndex()
|
||||
b.buf.Write(index)
|
||||
|
||||
// Write bloom filter.
|
||||
bdata := bf.JSONMarshal()
|
||||
n, err := b.buf.Write(bdata)
|
||||
y.Check(err)
|
||||
var buf [4]byte
|
||||
binary.BigEndian.PutUint32(buf[:], uint32(n))
|
||||
b.buf.Write(buf[:])
|
||||
|
||||
return b.buf.Bytes()
|
||||
}
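// Usage sketch (editor's illustration; nextKey, ts and val are placeholders,
// and error handling is elided):
//
//	b := NewTableBuilder()
//	defer b.Close()
//	for !b.ReachedCapacity(64 << 20) {
//		// Keys must arrive in sorted order and carry a version suffix.
//		_ = b.Add(y.KeyWithTs(nextKey(), ts), y.ValueStruct{Value: val})
//	}
//	data := b.Finish() // blocks + block index + bloom filter + its length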
|
|
@ -0,0 +1,539 @@
|
|||
/*
|
||||
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/dgraph-io/badger/y"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
type blockIterator struct {
|
||||
data []byte
|
||||
pos uint32
|
||||
err error
|
||||
baseKey []byte
|
||||
|
||||
key []byte
|
||||
val []byte
|
||||
init bool
|
||||
|
||||
last header // The last header we saw.
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Reset() {
|
||||
itr.pos = 0
|
||||
itr.err = nil
|
||||
itr.baseKey = []byte{}
|
||||
itr.key = []byte{}
|
||||
itr.val = []byte{}
|
||||
itr.init = false
|
||||
itr.last = header{}
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Init() {
|
||||
if !itr.init {
|
||||
itr.Next()
|
||||
}
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Valid() bool {
|
||||
return itr != nil && itr.err == nil
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Error() error {
|
||||
return itr.err
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Close() {}
|
||||
|
||||
var (
|
||||
origin = 0
|
||||
current = 1
|
||||
)
|
||||
|
||||
// Seek brings us to the first block element that is >= input key.
|
||||
func (itr *blockIterator) Seek(key []byte, whence int) {
|
||||
itr.err = nil
|
||||
|
||||
switch whence {
|
||||
case origin:
|
||||
itr.Reset()
|
||||
case current:
|
||||
}
|
||||
|
||||
var done bool
|
||||
for itr.Init(); itr.Valid(); itr.Next() {
|
||||
k := itr.Key()
|
||||
if y.CompareKeys(k, key) >= 0 {
|
||||
// We are done as k is >= key.
|
||||
done = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !done {
|
||||
itr.err = io.EOF
|
||||
}
|
||||
}
|
||||
|
||||
func (itr *blockIterator) SeekToFirst() {
|
||||
itr.err = nil
|
||||
itr.Init()
|
||||
}
|
||||
|
||||
// SeekToLast brings us to the last element. Valid should return true.
|
||||
func (itr *blockIterator) SeekToLast() {
|
||||
itr.err = nil
|
||||
for itr.Init(); itr.Valid(); itr.Next() {
|
||||
}
|
||||
itr.Prev()
|
||||
}
|
||||
|
||||
// parseKV would allocate a new byte slice for key and for value.
|
||||
func (itr *blockIterator) parseKV(h header) {
|
||||
if cap(itr.key) < int(h.plen+h.klen) {
|
||||
sz := int(h.plen) + int(h.klen) // Convert to int before adding to avoid uint16 overflow.
|
||||
itr.key = make([]byte, 2*sz)
|
||||
}
|
||||
itr.key = itr.key[:h.plen+h.klen]
|
||||
copy(itr.key, itr.baseKey[:h.plen])
|
||||
copy(itr.key[h.plen:], itr.data[itr.pos:itr.pos+uint32(h.klen)])
|
||||
itr.pos += uint32(h.klen)
|
||||
|
||||
if itr.pos+uint32(h.vlen) > uint32(len(itr.data)) {
|
||||
itr.err = errors.Errorf("Value exceeded size of block: %d %d %d %d %v",
|
||||
itr.pos, h.klen, h.vlen, len(itr.data), h)
|
||||
return
|
||||
}
|
||||
itr.val = y.SafeCopy(itr.val, itr.data[itr.pos:itr.pos+uint32(h.vlen)])
|
||||
itr.pos += uint32(h.vlen)
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Next() {
|
||||
itr.init = true
|
||||
itr.err = nil
|
||||
if itr.pos >= uint32(len(itr.data)) {
|
||||
itr.err = io.EOF
|
||||
return
|
||||
}
|
||||
|
||||
var h header
|
||||
itr.pos += uint32(h.Decode(itr.data[itr.pos:]))
|
||||
itr.last = h // Store the last header.
|
||||
|
||||
if h.klen == 0 && h.plen == 0 {
|
||||
// Last entry in the table.
|
||||
itr.err = io.EOF
|
||||
return
|
||||
}
|
||||
|
||||
// Populate baseKey if it isn't set yet. This would only happen for the first Next.
|
||||
if len(itr.baseKey) == 0 {
|
||||
// This should be the first Next() for this block. Hence, prefix length should be zero.
|
||||
y.AssertTrue(h.plen == 0)
|
||||
itr.baseKey = itr.data[itr.pos : itr.pos+uint32(h.klen)]
|
||||
}
|
||||
itr.parseKV(h)
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Prev() {
|
||||
if !itr.init {
|
||||
return
|
||||
}
|
||||
itr.err = nil
|
||||
if itr.last.prev == math.MaxUint32 {
|
||||
// This is the first element of the block!
|
||||
itr.err = io.EOF
|
||||
itr.pos = 0
|
||||
return
|
||||
}
|
||||
|
||||
// Move back using current header's prev.
|
||||
itr.pos = itr.last.prev
|
||||
|
||||
var h header
|
||||
y.AssertTruef(itr.pos < uint32(len(itr.data)), "%d %d", itr.pos, len(itr.data))
|
||||
itr.pos += uint32(h.Decode(itr.data[itr.pos:]))
|
||||
itr.parseKV(h)
|
||||
itr.last = h
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Key() []byte {
|
||||
if itr.err != nil {
|
||||
return nil
|
||||
}
|
||||
return itr.key
|
||||
}
|
||||
|
||||
func (itr *blockIterator) Value() []byte {
|
||||
if itr.err != nil {
|
||||
return nil
|
||||
}
|
||||
return itr.val
|
||||
}
|
||||
|
||||
// Iterator is an iterator for a Table.
|
||||
type Iterator struct {
|
||||
t *Table
|
||||
bpos int
|
||||
bi *blockIterator
|
||||
err error
|
||||
|
||||
// Internally, Iterator is bidirectional. However, we only expose the
|
||||
// unidirectional functionality for now.
|
||||
reversed bool
|
||||
}
|
||||
|
||||
// NewIterator returns a new iterator of the Table
|
||||
func (t *Table) NewIterator(reversed bool) *Iterator {
|
||||
t.IncrRef() // Important.
|
||||
ti := &Iterator{t: t, reversed: reversed}
|
||||
ti.next()
|
||||
return ti
|
||||
}
|
||||
|
||||
// Close closes the iterator (and it must be called).
|
||||
func (itr *Iterator) Close() error {
|
||||
return itr.t.DecrRef()
|
||||
}
|
||||
|
||||
func (itr *Iterator) reset() {
|
||||
itr.bpos = 0
|
||||
itr.err = nil
|
||||
}
|
||||
|
||||
// Valid follows the y.Iterator interface
|
||||
func (itr *Iterator) Valid() bool {
|
||||
return itr.err == nil
|
||||
}
|
||||
|
||||
func (itr *Iterator) seekToFirst() {
|
||||
numBlocks := len(itr.t.blockIndex)
|
||||
if numBlocks == 0 {
|
||||
itr.err = io.EOF
|
||||
return
|
||||
}
|
||||
itr.bpos = 0
|
||||
block, err := itr.t.block(itr.bpos)
|
||||
if err != nil {
|
||||
itr.err = err
|
||||
return
|
||||
}
|
||||
itr.bi = block.NewIterator()
|
||||
itr.bi.SeekToFirst()
|
||||
itr.err = itr.bi.Error()
|
||||
}
|
||||
|
||||
func (itr *Iterator) seekToLast() {
|
||||
numBlocks := len(itr.t.blockIndex)
|
||||
if numBlocks == 0 {
|
||||
itr.err = io.EOF
|
||||
return
|
||||
}
|
||||
itr.bpos = numBlocks - 1
|
||||
block, err := itr.t.block(itr.bpos)
|
||||
if err != nil {
|
||||
itr.err = err
|
||||
return
|
||||
}
|
||||
itr.bi = block.NewIterator()
|
||||
itr.bi.SeekToLast()
|
||||
itr.err = itr.bi.Error()
|
||||
}
|
||||
|
||||
func (itr *Iterator) seekHelper(blockIdx int, key []byte) {
|
||||
itr.bpos = blockIdx
|
||||
block, err := itr.t.block(blockIdx)
|
||||
if err != nil {
|
||||
itr.err = err
|
||||
return
|
||||
}
|
||||
itr.bi = block.NewIterator()
|
||||
itr.bi.Seek(key, origin)
|
||||
itr.err = itr.bi.Error()
|
||||
}
|
||||
|
||||
// seekFrom brings us to a key that is >= input key.
|
||||
func (itr *Iterator) seekFrom(key []byte, whence int) {
|
||||
itr.err = nil
|
||||
switch whence {
|
||||
case origin:
|
||||
itr.reset()
|
||||
case current:
|
||||
}
|
||||
|
||||
idx := sort.Search(len(itr.t.blockIndex), func(idx int) bool {
|
||||
ko := itr.t.blockIndex[idx]
|
||||
return y.CompareKeys(ko.key, key) > 0
|
||||
})
|
||||
if idx == 0 {
|
||||
// The smallest key in our table is already strictly > key. We can return that.
|
||||
// This is like a SeekToFirst.
|
||||
itr.seekHelper(0, key)
|
||||
return
|
||||
}
|
||||
|
||||
// block[idx].smallest is > key.
|
||||
// Since idx>0, we know block[idx-1].smallest is <= key.
|
||||
// There are two cases.
|
||||
// 1) Everything in block[idx-1] is strictly < key. In this case, we should go to the first
|
||||
// element of block[idx].
|
||||
// 2) Some element in block[idx-1] is >= key. We should go to that element.
|
||||
itr.seekHelper(idx-1, key)
|
||||
if itr.err == io.EOF {
|
||||
// Case 1. Need to visit block[idx].
|
||||
if idx == len(itr.t.blockIndex) {
|
||||
// If idx == len(itr.t.blockIndex), then input key is greater than ANY element of table.
|
||||
// There's nothing we can do. Valid() should return false as we seek to end of table.
|
||||
return
|
||||
}
|
||||
// Since block[idx].smallest is > key, this is essentially a block[idx].SeekToFirst.
|
||||
itr.seekHelper(idx, key)
|
||||
}
|
||||
// Case 2: No need to do anything. We already did the seek in block[idx-1].
|
||||
}
|
||||
|
||||
// seek will reset iterator and seek to >= key.
|
||||
func (itr *Iterator) seek(key []byte) {
|
||||
itr.seekFrom(key, origin)
|
||||
}
|
||||
|
||||
// seekForPrev will reset iterator and seek to <= key.
|
||||
func (itr *Iterator) seekForPrev(key []byte) {
|
||||
// TODO: Optimize this. We shouldn't have to take a Prev step.
|
||||
itr.seekFrom(key, origin)
|
||||
if !bytes.Equal(itr.Key(), key) {
|
||||
itr.prev()
|
||||
}
|
||||
}
|
||||
|
||||
func (itr *Iterator) next() {
|
||||
itr.err = nil
|
||||
|
||||
if itr.bpos >= len(itr.t.blockIndex) {
|
||||
itr.err = io.EOF
|
||||
return
|
||||
}
|
||||
|
||||
if itr.bi == nil {
|
||||
block, err := itr.t.block(itr.bpos)
|
||||
if err != nil {
|
||||
itr.err = err
|
||||
return
|
||||
}
|
||||
itr.bi = block.NewIterator()
|
||||
itr.bi.SeekToFirst()
|
||||
itr.err = itr.bi.Error()
|
||||
return
|
||||
}
|
||||
|
||||
itr.bi.Next()
|
||||
if !itr.bi.Valid() {
|
||||
itr.bpos++
|
||||
itr.bi = nil
|
||||
itr.next()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func (itr *Iterator) prev() {
|
||||
itr.err = nil
|
||||
if itr.bpos < 0 {
|
||||
itr.err = io.EOF
|
||||
return
|
||||
}
|
||||
|
||||
if itr.bi == nil {
|
||||
block, err := itr.t.block(itr.bpos)
|
||||
if err != nil {
|
||||
itr.err = err
|
||||
return
|
||||
}
|
||||
itr.bi = block.NewIterator()
|
||||
itr.bi.SeekToLast()
|
||||
itr.err = itr.bi.Error()
|
||||
return
|
||||
}
|
||||
|
||||
itr.bi.Prev()
|
||||
if !itr.bi.Valid() {
|
||||
itr.bpos--
|
||||
itr.bi = nil
|
||||
itr.prev()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Key follows the y.Iterator interface
|
||||
func (itr *Iterator) Key() []byte {
|
||||
return itr.bi.Key()
|
||||
}
|
||||
|
||||
// Value follows the y.Iterator interface
|
||||
func (itr *Iterator) Value() (ret y.ValueStruct) {
|
||||
ret.Decode(itr.bi.Value())
|
||||
return
|
||||
}
|
||||
|
||||
// Next follows the y.Iterator interface
|
||||
func (itr *Iterator) Next() {
|
||||
if !itr.reversed {
|
||||
itr.next()
|
||||
} else {
|
||||
itr.prev()
|
||||
}
|
||||
}
|
||||
|
||||
// Rewind follows the y.Iterator interface
|
||||
func (itr *Iterator) Rewind() {
|
||||
if !itr.reversed {
|
||||
itr.seekToFirst()
|
||||
} else {
|
||||
itr.seekToLast()
|
||||
}
|
||||
}
|
||||
|
||||
// Seek follows the y.Iterator interface
|
||||
func (itr *Iterator) Seek(key []byte) {
|
||||
if !itr.reversed {
|
||||
itr.seek(key)
|
||||
} else {
|
||||
itr.seekForPrev(key)
|
||||
}
|
||||
}
|
||||
|
||||
// ConcatIterator concatenates the sequences defined by several iterators. (It only works with
|
||||
// TableIterators, probably just because it's faster to not be so generic.)
|
||||
type ConcatIterator struct {
|
||||
idx int // Which iterator is active now.
|
||||
cur *Iterator
|
||||
iters []*Iterator // Corresponds to tables.
|
||||
tables []*Table // Disregarding reversed, this is in ascending order.
|
||||
reversed bool
|
||||
}
|
||||
|
||||
// NewConcatIterator creates a new concatenated iterator
|
||||
func NewConcatIterator(tbls []*Table, reversed bool) *ConcatIterator {
|
||||
iters := make([]*Iterator, len(tbls))
|
||||
for i := 0; i < len(tbls); i++ {
|
||||
iters[i] = tbls[i].NewIterator(reversed)
|
||||
}
|
||||
return &ConcatIterator{
|
||||
reversed: reversed,
|
||||
iters: iters,
|
||||
tables: tbls,
|
||||
idx: -1, // Not really necessary because s.it.Valid()=false, but good to have.
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ConcatIterator) setIdx(idx int) {
|
||||
s.idx = idx
|
||||
if idx < 0 || idx >= len(s.iters) {
|
||||
s.cur = nil
|
||||
} else {
|
||||
s.cur = s.iters[s.idx]
|
||||
}
|
||||
}
|
||||
|
||||
// Rewind implements y.Interface
|
||||
func (s *ConcatIterator) Rewind() {
|
||||
if len(s.iters) == 0 {
|
||||
return
|
||||
}
|
||||
if !s.reversed {
|
||||
s.setIdx(0)
|
||||
} else {
|
||||
s.setIdx(len(s.iters) - 1)
|
||||
}
|
||||
s.cur.Rewind()
|
||||
}
|
||||
|
||||
// Valid implements y.Interface
|
||||
func (s *ConcatIterator) Valid() bool {
|
||||
return s.cur != nil && s.cur.Valid()
|
||||
}
|
||||
|
||||
// Key implements y.Interface
|
||||
func (s *ConcatIterator) Key() []byte {
|
||||
return s.cur.Key()
|
||||
}
|
||||
|
||||
// Value implements y.Interface
|
||||
func (s *ConcatIterator) Value() y.ValueStruct {
|
||||
return s.cur.Value()
|
||||
}
|
||||
|
||||
// Seek brings us to element >= key if reversed is false. Otherwise, <= key.
|
||||
func (s *ConcatIterator) Seek(key []byte) {
|
||||
var idx int
|
||||
if !s.reversed {
|
||||
idx = sort.Search(len(s.tables), func(i int) bool {
|
||||
return y.CompareKeys(s.tables[i].Biggest(), key) >= 0
|
||||
})
|
||||
} else {
|
||||
n := len(s.tables)
|
||||
idx = n - 1 - sort.Search(n, func(i int) bool {
|
||||
return y.CompareKeys(s.tables[n-1-i].Smallest(), key) <= 0
|
||||
})
|
||||
}
|
||||
if idx >= len(s.tables) || idx < 0 {
|
||||
s.setIdx(-1)
|
||||
return
|
||||
}
|
||||
// For reversed=false, we know s.tables[i-1].Biggest() < key. Thus, the
|
||||
// previous table cannot possibly contain key.
|
||||
s.setIdx(idx)
|
||||
s.cur.Seek(key)
|
||||
}
|
||||
|
||||
// Next advances our concat iterator.
|
||||
func (s *ConcatIterator) Next() {
|
||||
s.cur.Next()
|
||||
if s.cur.Valid() {
|
||||
// Nothing to do. Just stay with the current table.
|
||||
return
|
||||
}
|
||||
for { // In case there are empty tables.
|
||||
if !s.reversed {
|
||||
s.setIdx(s.idx + 1)
|
||||
} else {
|
||||
s.setIdx(s.idx - 1)
|
||||
}
|
||||
if s.cur == nil {
|
||||
// End of list. Valid will become false.
|
||||
return
|
||||
}
|
||||
s.cur.Rewind()
|
||||
if s.cur.Valid() {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close implements y.Interface.
|
||||
func (s *ConcatIterator) Close() error {
|
||||
for _, it := range s.iters {
|
||||
if err := it.Close(); err != nil {
|
||||
return errors.Wrap(err, "ConcatIterator")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,359 @@
|
|||
/*
|
||||
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/AndreasBriese/bbloom"
|
||||
"github.com/dgraph-io/badger/options"
|
||||
"github.com/dgraph-io/badger/y"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
const fileSuffix = ".sst"
|
||||
|
||||
type keyOffset struct {
|
||||
key []byte
|
||||
offset int
|
||||
len int
|
||||
}
|
||||
|
||||
// Table represents a loaded table file with the info we have about it
|
||||
type Table struct {
|
||||
sync.Mutex
|
||||
|
||||
fd *os.File // Own fd.
|
||||
tableSize int // Initialized in OpenTable, using fd.Stat().
|
||||
|
||||
blockIndex []keyOffset
|
||||
ref int32 // For file garbage collection. Atomic.
|
||||
|
||||
loadingMode options.FileLoadingMode
|
||||
mmap []byte // Memory mapped.
|
||||
|
||||
// The following are initialized once and const.
|
||||
smallest, biggest []byte // Smallest and largest keys.
|
||||
id uint64 // file id, part of filename
|
||||
|
||||
bf bbloom.Bloom
|
||||
}
|
||||
|
||||
// IncrRef increments the refcount (having to do with whether the file should be deleted)
|
||||
func (t *Table) IncrRef() {
|
||||
atomic.AddInt32(&t.ref, 1)
|
||||
}
|
||||
|
||||
// DecrRef decrements the refcount and possibly deletes the table
|
||||
func (t *Table) DecrRef() error {
|
||||
newRef := atomic.AddInt32(&t.ref, -1)
|
||||
if newRef == 0 {
|
||||
// We can safely delete this file, because for all the current files, we always have
|
||||
// at least one reference pointing to them.
|
||||
|
||||
// On Windows, the file must be unmapped before it can be deleted.
|
||||
if t.loadingMode == options.MemoryMap {
|
||||
y.Munmap(t.mmap)
|
||||
}
|
||||
if err := t.fd.Truncate(0); err != nil {
|
||||
// This is very important to let the FS know that the file is deleted.
|
||||
return err
|
||||
}
|
||||
filename := t.fd.Name()
|
||||
if err := t.fd.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Remove(filename); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type block struct {
|
||||
offset int
|
||||
data []byte
|
||||
}
|
||||
|
||||
func (b block) NewIterator() *blockIterator {
|
||||
return &blockIterator{data: b.data}
|
||||
}
|
||||
|
||||
// OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function
|
||||
// entry. Returns a table with one reference count on it (decrementing which may delete the file!
|
||||
// -- consider t.Close() instead). The fd has to be writeable because we call Truncate on it before
|
||||
// deleting.
|
||||
func OpenTable(fd *os.File, loadingMode options.FileLoadingMode) (*Table, error) {
|
||||
fileInfo, err := fd.Stat()
|
||||
if err != nil {
|
||||
// It's OK to ignore fd.Close() errs in this function because we have only read
|
||||
// from the file.
|
||||
_ = fd.Close()
|
||||
return nil, y.Wrap(err)
|
||||
}
|
||||
|
||||
filename := fileInfo.Name()
|
||||
id, ok := ParseFileID(filename)
|
||||
if !ok {
|
||||
_ = fd.Close()
|
||||
return nil, errors.Errorf("Invalid filename: %s", filename)
|
||||
}
|
||||
t := &Table{
|
||||
fd: fd,
|
||||
ref: 1, // Caller is given one reference.
|
||||
id: id,
|
||||
loadingMode: loadingMode,
|
||||
}
|
||||
|
||||
t.tableSize = int(fileInfo.Size())
|
||||
|
||||
if loadingMode == options.MemoryMap {
|
||||
t.mmap, err = y.Mmap(fd, false, fileInfo.Size())
|
||||
if err != nil {
|
||||
_ = fd.Close()
|
||||
return nil, y.Wrapf(err, "Unable to map file")
|
||||
}
|
||||
} else if loadingMode == options.LoadToRAM {
|
||||
err = t.loadToRAM()
|
||||
if err != nil {
|
||||
_ = fd.Close()
|
||||
return nil, y.Wrap(err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := t.readIndex(); err != nil {
|
||||
return nil, y.Wrap(err)
|
||||
}
|
||||
|
||||
it := t.NewIterator(false)
|
||||
defer it.Close()
|
||||
it.Rewind()
|
||||
if it.Valid() {
|
||||
t.smallest = it.Key()
|
||||
}
|
||||
|
||||
it2 := t.NewIterator(true)
|
||||
defer it2.Close()
|
||||
it2.Rewind()
|
||||
if it2.Valid() {
|
||||
t.biggest = it2.Key()
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// Close closes the open table. (Releases resources back to the OS.)
|
||||
func (t *Table) Close() error {
|
||||
if t.loadingMode == options.MemoryMap {
|
||||
y.Munmap(t.mmap)
|
||||
}
|
||||
|
||||
return t.fd.Close()
|
||||
}
|
||||
|
||||
func (t *Table) read(off int, sz int) ([]byte, error) {
|
||||
if len(t.mmap) > 0 {
|
||||
if len(t.mmap[off:]) < sz {
|
||||
return nil, y.ErrEOF
|
||||
}
|
||||
return t.mmap[off : off+sz], nil
|
||||
}
|
||||
|
||||
res := make([]byte, sz)
|
||||
nbr, err := t.fd.ReadAt(res, int64(off))
|
||||
y.NumReads.Add(1)
|
||||
y.NumBytesRead.Add(int64(nbr))
|
||||
return res, err
|
||||
}
|
||||
|
||||
func (t *Table) readNoFail(off int, sz int) []byte {
|
||||
res, err := t.read(off, sz)
|
||||
y.Check(err)
|
||||
return res
|
||||
}
|
||||
|
||||
func (t *Table) readIndex() error {
|
||||
readPos := t.tableSize
|
||||
|
||||
// Read bloom filter.
|
||||
readPos -= 4
|
||||
buf := t.readNoFail(readPos, 4)
|
||||
bloomLen := int(binary.BigEndian.Uint32(buf))
|
||||
readPos -= bloomLen
|
||||
data := t.readNoFail(readPos, bloomLen)
|
||||
t.bf = bbloom.JSONUnmarshal(data)
|
||||
|
||||
readPos -= 4
|
||||
buf = t.readNoFail(readPos, 4)
|
||||
restartsLen := int(binary.BigEndian.Uint32(buf))
|
||||
|
||||
readPos -= 4 * restartsLen
|
||||
buf = t.readNoFail(readPos, 4*restartsLen)
|
||||
|
||||
offsets := make([]int, restartsLen)
|
||||
for i := 0; i < restartsLen; i++ {
|
||||
offsets[i] = int(binary.BigEndian.Uint32(buf[:4]))
|
||||
buf = buf[4:]
|
||||
}
|
||||
|
||||
// The last offset stores the end of the last block.
|
||||
for i := 0; i < len(offsets); i++ {
|
||||
var o int
|
||||
if i == 0 {
|
||||
o = 0
|
||||
} else {
|
||||
o = offsets[i-1]
|
||||
}
|
||||
|
||||
ko := keyOffset{
|
||||
offset: o,
|
||||
len: offsets[i] - o,
|
||||
}
|
||||
t.blockIndex = append(t.blockIndex, ko)
|
||||
}
|
||||
|
||||
che := make(chan error, len(t.blockIndex))
|
||||
blocks := make(chan int, len(t.blockIndex))
|
||||
|
||||
for i := 0; i < len(t.blockIndex); i++ {
|
||||
blocks <- i
|
||||
}
|
||||
|
||||
for i := 0; i < 64; i++ { // Run 64 goroutines.
|
||||
go func() {
|
||||
var h header
|
||||
|
||||
for index := range blocks {
|
||||
ko := &t.blockIndex[index]
|
||||
|
||||
offset := ko.offset
|
||||
buf, err := t.read(offset, h.Size())
|
||||
if err != nil {
|
||||
che <- errors.Wrap(err, "While reading first header in block")
|
||||
continue
|
||||
}
|
||||
|
||||
h.Decode(buf)
|
||||
y.AssertTruef(h.plen == 0, "Key offset: %+v, h.plen = %d", *ko, h.plen)
|
||||
|
||||
offset += h.Size()
|
||||
buf = make([]byte, h.klen)
|
||||
var out []byte
|
||||
if out, err = t.read(offset, int(h.klen)); err != nil {
|
||||
che <- errors.Wrap(err, "While reading first key in block")
|
||||
continue
|
||||
}
|
||||
y.AssertTrue(len(buf) == copy(buf, out))
|
||||
|
||||
ko.key = buf
|
||||
che <- nil
|
||||
}
|
||||
}()
|
||||
}
|
||||
close(blocks) // to stop reading goroutines
|
||||
|
||||
var readError error
|
||||
for i := 0; i < len(t.blockIndex); i++ {
|
||||
if err := <-che; err != nil && readError == nil {
|
||||
readError = err
|
||||
}
|
||||
}
|
||||
if readError != nil {
|
||||
return readError
|
||||
}
|
||||
|
||||
return nil
|
||||
}
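// On-disk layout implied by readIndex (editor's illustration), front to back:
//
//	+------------+--------------------+------------------+------------+---------------+
//	| blocks ... | restart offsets    | len(restarts) 4B | bloom JSON | len(bloom) 4B |
//	|            | (4B each, big-end) |                  |            | (file tail)   |
//	+------------+--------------------+------------------+------------+---------------+
//
// readIndex walks this layout backwards from the end of the file.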
|
||||
|
||||
func (t *Table) block(idx int) (block, error) {
|
||||
y.AssertTruef(idx >= 0, "idx=%d", idx)
|
||||
if idx >= len(t.blockIndex) {
|
||||
return block{}, errors.New("block out of index")
|
||||
}
|
||||
|
||||
ko := t.blockIndex[idx]
|
||||
blk := block{
|
||||
offset: ko.offset,
|
||||
}
|
||||
var err error
|
||||
blk.data, err = t.read(blk.offset, ko.len)
|
||||
return blk, err
|
||||
}
|
||||
|
||||
// Size is its file size in bytes
|
||||
func (t *Table) Size() int64 { return int64(t.tableSize) }
|
||||
|
||||
// Smallest is its smallest key, or nil if there are none
|
||||
func (t *Table) Smallest() []byte { return t.smallest }
|
||||
|
||||
// Biggest is its biggest key, or nil if there are none
|
||||
func (t *Table) Biggest() []byte { return t.biggest }
|
||||
|
||||
// Filename is NOT the file name. Just kidding, it is.
|
||||
func (t *Table) Filename() string { return t.fd.Name() }
|
||||
|
||||
// ID is the table's ID number (used to make the file name).
|
||||
func (t *Table) ID() uint64 { return t.id }
|
||||
|
||||
// DoesNotHave returns true if (but not "only if") the table does not have the key. It does a
|
||||
// bloom filter lookup.
|
||||
func (t *Table) DoesNotHave(key []byte) bool { return !t.bf.Has(key) }
|
||||
|
||||
// ParseFileID reads the file id out of a filename.
|
||||
func ParseFileID(name string) (uint64, bool) {
|
||||
name = path.Base(name)
|
||||
if !strings.HasSuffix(name, fileSuffix) {
|
||||
return 0, false
|
||||
}
|
||||
// suffix := name[len(fileSuffix):]
|
||||
name = strings.TrimSuffix(name, fileSuffix)
|
||||
id, err := strconv.Atoi(name)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
y.AssertTrue(id >= 0)
|
||||
return uint64(id), true
|
||||
}
|
||||
|
||||
// IDToFilename does the inverse of ParseFileID
|
||||
func IDToFilename(id uint64) string {
|
||||
return fmt.Sprintf("%06d", id) + fileSuffix
|
||||
}
|
||||
|
||||
// NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table
|
||||
// filepath.
|
||||
func NewFilename(id uint64, dir string) string {
|
||||
return filepath.Join(dir, IDToFilename(id))
|
||||
}
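// Examples (editor's illustration): IDToFilename(2) == "000002.sst";
// ParseFileID("/some/dir/000002.sst") returns (2, true); a name without the
// ".sst" suffix, or with a non-numeric stem, yields (0, false).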
|
||||
|
||||
func (t *Table) loadToRAM() error {
|
||||
t.mmap = make([]byte, t.tableSize)
|
||||
read, err := t.fd.ReadAt(t.mmap, 0)
|
||||
if err != nil || read != t.tableSize {
|
||||
return y.Wrapf(err, "Unable to load file in memory. Table file: %s", t.Filename())
|
||||
}
|
||||
y.NumReads.Add(1)
|
||||
y.NumBytesRead.Add(int64(read))
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
l=$(go list ./...)
|
||||
for x in $l; do
|
||||
echo "Testing package $x"
|
||||
go test -v $x
|
||||
done
|
|
@ -0,0 +1,549 @@
|
|||
/*
|
||||
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package badger
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/dgraph-io/badger/y"
|
||||
farm "github.com/dgryski/go-farm"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
const (
|
||||
deleteItem = iota
|
||||
setItem
|
||||
)
|
||||
|
||||
type uint64Heap []uint64
|
||||
|
||||
func (u uint64Heap) Len() int { return len(u) }
|
||||
func (u uint64Heap) Less(i int, j int) bool { return u[i] < u[j] }
|
||||
func (u uint64Heap) Swap(i int, j int) { u[i], u[j] = u[j], u[i] }
|
||||
func (u *uint64Heap) Push(x interface{}) { *u = append(*u, x.(uint64)) }
|
||||
func (u *uint64Heap) Pop() interface{} {
|
||||
old := *u
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
*u = old[0 : n-1]
|
||||
return x
|
||||
}
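// uint64Heap satisfies heap.Interface (import "container/heap"); a usage
// sketch (editor's illustration):
//
//	h := &uint64Heap{3, 1, 2}
//	heap.Init(h)
//	heap.Push(h, uint64(0))
//	smallest := heap.Pop(h).(uint64) // 0; Pop returns values in ascending order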
|
||||
|
||||
type oracle struct {
|
||||
curRead uint64 // Managed by the mutex.
|
||||
refCount int64
|
||||
isManaged bool // Does not change value, so no locking required.
|
||||
|
||||
sync.Mutex
|
||||
writeLock sync.Mutex
|
||||
nextCommit uint64
|
||||
|
||||
// commits stores a key fingerprint and latest commit counter for it.
|
||||
// refCount is used to clear out commits map to avoid a memory blowup.
|
||||
commits map[uint64]uint64
|
||||
}
|
||||
|
||||
func (o *oracle) addRef() {
|
||||
atomic.AddInt64(&o.refCount, 1)
|
||||
}
|
||||
|
||||
func (o *oracle) decrRef() {
|
||||
if count := atomic.AddInt64(&o.refCount, -1); count == 0 {
|
||||
// Clear out commits maps to release memory.
|
||||
o.Lock()
|
||||
// Avoids the race where something new is added to the commits map
|
||||
// after we check refCount and before we take Lock.
|
||||
if atomic.LoadInt64(&o.refCount) != 0 {
|
||||
o.Unlock()
|
||||
return
|
||||
}
|
||||
if len(o.commits) >= 1000 { // If the map is still small, let it slide.
|
||||
o.commits = make(map[uint64]uint64)
|
||||
}
|
||||
o.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
func (o *oracle) readTs() uint64 {
|
||||
if o.isManaged {
|
||||
return math.MaxUint64
|
||||
}
|
||||
return atomic.LoadUint64(&o.curRead)
|
||||
}
|
||||
|
||||
func (o *oracle) commitTs() uint64 {
|
||||
o.Lock()
|
||||
defer o.Unlock()
|
||||
return o.nextCommit
|
||||
}
|
||||
|
||||
// hasConflict must be called while having a lock.
|
||||
func (o *oracle) hasConflict(txn *Txn) bool {
|
||||
if len(txn.reads) == 0 {
|
||||
return false
|
||||
}
|
||||
for _, ro := range txn.reads {
|
||||
if ts, has := o.commits[ro]; has && ts > txn.readTs {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (o *oracle) newCommitTs(txn *Txn) uint64 {
|
||||
o.Lock()
|
||||
defer o.Unlock()
|
||||
|
||||
if o.hasConflict(txn) {
|
||||
return 0
|
||||
}
|
||||
|
||||
var ts uint64
|
||||
if !o.isManaged {
|
||||
// This is the general case, when the user doesn't specify read and commit timestamps.
|
||||
ts = o.nextCommit
|
||||
o.nextCommit++
|
||||
|
||||
} else {
|
||||
// If commitTs is set, use it instead.
|
||||
ts = txn.commitTs
|
||||
}
|
||||
|
||||
for _, w := range txn.writes {
|
||||
o.commits[w] = ts // Update the commitTs.
|
||||
}
|
||||
return ts
|
||||
}
|
||||
|
||||
func (o *oracle) doneCommit(cts uint64) {
|
||||
if o.isManaged {
|
||||
// No need to update anything.
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
curRead := atomic.LoadUint64(&o.curRead)
|
||||
if cts <= curRead {
|
||||
return
|
||||
}
|
||||
atomic.CompareAndSwapUint64(&o.curRead, curRead, cts)
|
||||
}
|
||||
}
|
||||
|
||||
// Txn represents a Badger transaction.
|
||||
type Txn struct {
|
||||
readTs uint64
|
||||
commitTs uint64
|
||||
|
||||
update bool // update is used to conditionally keep track of reads.
|
||||
reads []uint64 // contains fingerprints of keys read.
|
||||
writes []uint64 // contains fingerprints of keys written.
|
||||
|
||||
pendingWrites map[string]*Entry // cache stores any writes done by txn.
|
||||
|
||||
db *DB
|
||||
callbacks []func()
|
||||
discarded bool
|
||||
|
||||
size int64
|
||||
count int64
|
||||
}
|
||||
|
||||
type pendingWritesIterator struct {
|
||||
entries []*Entry
|
||||
nextIdx int
|
||||
readTs uint64
|
||||
reversed bool
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Next() {
|
||||
pi.nextIdx++
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Rewind() {
|
||||
pi.nextIdx = 0
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Seek(key []byte) {
|
||||
key = y.ParseKey(key)
|
||||
pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool {
|
||||
cmp := bytes.Compare(pi.entries[idx].Key, key)
|
||||
if !pi.reversed {
|
||||
return cmp >= 0
|
||||
}
|
||||
return cmp <= 0
|
||||
})
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Key() []byte {
|
||||
y.AssertTrue(pi.Valid())
|
||||
entry := pi.entries[pi.nextIdx]
|
||||
return y.KeyWithTs(entry.Key, pi.readTs)
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Value() y.ValueStruct {
|
||||
y.AssertTrue(pi.Valid())
|
||||
entry := pi.entries[pi.nextIdx]
|
||||
return y.ValueStruct{
|
||||
Value: entry.Value,
|
||||
Meta: entry.meta,
|
||||
UserMeta: entry.UserMeta,
|
||||
ExpiresAt: entry.ExpiresAt,
|
||||
Version: pi.readTs,
|
||||
}
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Valid() bool {
|
||||
return pi.nextIdx < len(pi.entries)
|
||||
}
|
||||
|
||||
func (pi *pendingWritesIterator) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator {
|
||||
if !txn.update || len(txn.pendingWrites) == 0 {
|
||||
return nil
|
||||
}
|
||||
entries := make([]*Entry, 0, len(txn.pendingWrites))
|
||||
for _, e := range txn.pendingWrites {
|
||||
entries = append(entries, e)
|
||||
}
|
||||
// Number of pending writes per transaction shouldn't be too big in general.
|
||||
sort.Slice(entries, func(i, j int) bool {
|
||||
cmp := bytes.Compare(entries[i].Key, entries[j].Key)
|
||||
if !reversed {
|
||||
return cmp < 0
|
||||
}
|
||||
return cmp > 0
|
||||
})
|
||||
return &pendingWritesIterator{
|
||||
readTs: txn.readTs,
|
||||
entries: entries,
|
||||
reversed: reversed,
|
||||
}
|
||||
}
|
||||
|
||||
func (txn *Txn) checkSize(e *Entry) error {
|
||||
count := txn.count + 1
|
||||
// Extra bytes for version in key.
|
||||
size := txn.size + int64(e.estimateSize(txn.db.opt.ValueThreshold)) + 10
|
||||
if count >= txn.db.opt.maxBatchCount || size >= txn.db.opt.maxBatchSize {
|
||||
return ErrTxnTooBig
|
||||
}
|
||||
txn.count, txn.size = count, size
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set adds a key-value pair to the database.
|
||||
//
|
||||
// It will return ErrReadOnlyTxn if update flag was set to false when creating the
|
||||
// transaction.
|
||||
func (txn *Txn) Set(key, val []byte) error {
|
||||
e := &Entry{
|
||||
Key: key,
|
||||
Value: val,
|
||||
}
|
||||
return txn.SetEntry(e)
|
||||
}
|
||||
|
||||
// SetWithMeta adds a key-value pair to the database, along with a metadata
|
||||
// byte. This byte is stored alongside the key, and can be used as an aid to
|
||||
// interpret the value or store other contextual bits corresponding to the
|
||||
// key-value pair.
|
||||
func (txn *Txn) SetWithMeta(key, val []byte, meta byte) error {
|
||||
e := &Entry{Key: key, Value: val, UserMeta: meta}
|
||||
return txn.SetEntry(e)
|
||||
}
|
||||
|
||||
// SetWithTTL adds a key-value pair to the database, along with a time-to-live
// (TTL) setting. A key stored with a TTL would automatically expire after the
// time has elapsed, and be eligible for garbage collection.
func (txn *Txn) SetWithTTL(key, val []byte, dur time.Duration) error {
	expire := time.Now().Add(dur).Unix()
	e := &Entry{Key: key, Value: val, ExpiresAt: uint64(expire)}
	return txn.SetEntry(e)
}

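For orientation, a minimal sketch of how these setters compose. The `/tmp/badger` path and keys are illustrative, and `badger.DefaultOptions`/`badger.Open` are assumed entry points from the rest of this vendored package, not shown in this hunk:

```go
package main

import (
	"log"
	"time"

	"github.com/dgraph-io/badger"
)

func main() {
	opts := badger.DefaultOptions
	opts.Dir, opts.ValueDir = "/tmp/badger", "/tmp/badger"
	db, err := badger.Open(opts)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	txn := db.NewTransaction(true) // read-write transaction
	defer txn.Discard()

	if err := txn.Set([]byte("answer"), []byte("42")); err != nil {
		log.Fatal(err)
	}
	// 0x01 is an arbitrary user-defined meta byte.
	if err := txn.SetWithMeta([]byte("tagged"), []byte("v"), 0x01); err != nil {
		log.Fatal(err)
	}
	// Expires, and becomes eligible for garbage collection, in one hour.
	if err := txn.SetWithTTL([]byte("session"), []byte("token"), time.Hour); err != nil {
		log.Fatal(err)
	}
	if err := txn.Commit(nil); err != nil { // nil callback: commit synchronously
		log.Fatal(err)
	}
}
```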
func (txn *Txn) modify(e *Entry, operation int) error {
	if !txn.update {
		return ErrReadOnlyTxn
	} else if txn.discarded {
		return ErrDiscardedTxn
	} else if len(e.Key) == 0 {
		return ErrEmptyKey
	} else if len(e.Key) > maxKeySize {
		return exceedsMaxKeySizeError(e.Key)
	} else if int64(len(e.Value)) > txn.db.opt.ValueLogFileSize {
		return exceedsMaxValueSizeError(e.Value, txn.db.opt.ValueLogFileSize)
	}
	if err := txn.checkSize(e); err != nil {
		return err
	}

	fp := farm.Fingerprint64(e.Key) // Avoid dealing with byte arrays.
	txn.writes = append(txn.writes, fp)
	txn.pendingWrites[string(e.Key)] = e
	return nil
}

// SetEntry takes an Entry struct and adds the key-value pair in the struct, along
// with other metadata to the database.
func (txn *Txn) SetEntry(e *Entry) error {
	return txn.modify(e, setItem)
}

// Delete deletes a key. This is done by adding a delete marker for the key at commit timestamp.
// Any reads happening before this timestamp would be unaffected. Any reads after this commit would
// see the deletion.
func (txn *Txn) Delete(key []byte) error {
	e := &Entry{
		Key:  key,
		meta: bitDelete,
	}
	return txn.modify(e, deleteItem)
}

// Get looks for key and returns corresponding Item.
// If key is not found, ErrKeyNotFound is returned.
func (txn *Txn) Get(key []byte) (item *Item, rerr error) {
	if len(key) == 0 {
		return nil, ErrEmptyKey
	} else if txn.discarded {
		return nil, ErrDiscardedTxn
	}

	item = new(Item)
	if txn.update {
		if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key) {
			if isDeletedOrExpired(e.meta, e.ExpiresAt) {
				return nil, ErrKeyNotFound
			}
			// Fulfill from cache.
			item.meta = e.meta
			item.val = e.Value
			item.userMeta = e.UserMeta
			item.key = key
			item.status = prefetched
			item.version = txn.readTs
			// We probably don't need to set db on item here.
			return item, nil
		}
		// Only track reads if this is update txn. No need to track read if txn serviced it
		// internally.
		fp := farm.Fingerprint64(key)
		txn.reads = append(txn.reads, fp)
	}

	seek := y.KeyWithTs(key, txn.readTs)
	vs, err := txn.db.get(seek)
	if err != nil {
		return nil, errors.Wrapf(err, "DB::Get key: %q", key)
	}
	if vs.Value == nil && vs.Meta == 0 {
		return nil, ErrKeyNotFound
	}
	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
		return nil, ErrKeyNotFound
	}

	item.key = key
	item.version = vs.Version
	item.meta = vs.Meta
	item.userMeta = vs.UserMeta
	item.db = txn.db
	item.vptr = vs.Value
	item.txn = txn
	return item, nil
}

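A hedged sketch of the intended call pattern for Get. It assumes an `Item.Value() ([]byte, error)` accessor on badger's `Item` type, which is defined in a file not shown in full in this diff:

```go
package main

import (
	"fmt"

	"github.com/dgraph-io/badger"
)

// readKey distinguishes "not found" from real errors. Item.Value is an
// assumption about the Item type defined elsewhere in this package.
func readKey(txn *badger.Txn) error {
	item, err := txn.Get([]byte("answer"))
	if err == badger.ErrKeyNotFound {
		return nil // absent, deleted, or expired at this read timestamp
	}
	if err != nil {
		return err
	}
	val, err := item.Value()
	if err != nil {
		return err
	}
	fmt.Printf("answer=%s\n", val)
	return nil
}
```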
func (txn *Txn) runCallbacks() {
	for _, cb := range txn.callbacks {
		cb()
	}
	txn.callbacks = nil
}

// Discard discards a created transaction. This method is very important and must be called. Commit
// method calls this internally, however, calling this multiple times doesn't cause any issues. So,
// this can safely be called via a defer right when transaction is created.
//
// NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned.
func (txn *Txn) Discard() {
	if txn.discarded { // Avoid a re-run.
		return
	}
	txn.discarded = true
	txn.runCallbacks()

	if txn.update {
		txn.db.orc.decrRef()
	}
}

// Commit commits the transaction, following these steps:
//
// 1. If there are no writes, return immediately.
//
// 2. Check if read rows were updated since txn started. If so, return ErrConflict.
//
// 3. If no conflict, generate a commit timestamp and update written rows' commit ts.
//
// 4. Batch up all writes, write them to value log and LSM tree.
//
// 5. If callback is provided, Badger will return immediately after checking
// for conflicts. Writes to the database will happen in the background. If
// there is a conflict, an error will be returned and the callback will not
// run. If there are no conflicts, the callback will be called in the
// background upon successful completion of writes or any error during write.
//
// If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM
// tree won't be updated, so there's no need for any rollback.
func (txn *Txn) Commit(callback func(error)) error {
	if txn.commitTs == 0 && txn.db.opt.managedTxns {
		return ErrManagedTxn
	}
	if txn.discarded {
		return ErrDiscardedTxn
	}
	defer txn.Discard()
	if len(txn.writes) == 0 {
		return nil // Nothing to do.
	}

	state := txn.db.orc
	state.writeLock.Lock()
	commitTs := state.newCommitTs(txn)
	if commitTs == 0 {
		state.writeLock.Unlock()
		return ErrConflict
	}

	entries := make([]*Entry, 0, len(txn.pendingWrites)+1)
	for _, e := range txn.pendingWrites {
		// Suffix the keys with commit ts, so the key versions are sorted in
		// descending order of commit timestamp.
		e.Key = y.KeyWithTs(e.Key, commitTs)
		e.meta |= bitTxn
		entries = append(entries, e)
	}
	e := &Entry{
		Key:   y.KeyWithTs(txnKey, commitTs),
		Value: []byte(strconv.FormatUint(commitTs, 10)),
		meta:  bitFinTxn,
	}
	entries = append(entries, e)

	req, err := txn.db.sendToWriteCh(entries)
	state.writeLock.Unlock()
	if err != nil {
		return err
	}

	// Need to release all locks or writes can get deadlocked.
	txn.runCallbacks()

	if callback == nil {
		// If batchSet failed, LSM would not have been updated. So, no need to rollback anything.

		// TODO: What if some of the txns successfully make it to value log, but others fail.
		// Nothing gets updated to LSM, until a restart happens.
		defer state.doneCommit(commitTs)
		return req.Wait()
	}
	go func() {
		err := req.Wait()
		// Write is complete. Let's call the callback function now.
		state.doneCommit(commitTs)
		callback(err)
	}()
	return nil
}

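A sketch of the asynchronous commit mode described in step 5 above, assuming a `*badger.DB` handle obtained elsewhere. With a nil callback, Commit instead blocks and may return `ErrConflict`, which callers typically retry:

```go
package main

import (
	"log"

	"github.com/dgraph-io/badger"
)

// asyncWrite stages one write and commits it asynchronously: Commit returns
// right after conflict detection, and the callback later reports the result
// of the background write to the value log and LSM tree.
func asyncWrite(db *badger.DB) error {
	txn := db.NewTransaction(true)
	defer txn.Discard()
	if err := txn.Set([]byte("k"), []byte("v")); err != nil {
		return err
	}
	return txn.Commit(func(err error) {
		if err != nil {
			log.Printf("async commit failed: %v", err)
		}
	})
}
```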
// NewTransaction creates a new transaction. Badger supports concurrent execution of transactions,
// providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking
// the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by
// another transaction.
//
// For read-only transactions, set update to false. In this mode, we don't track the rows read for
// any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead.
//
// Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and
// should only be run serially. It doesn't matter if a transaction is created by one goroutine and
// passed down to another, as long as the Txn APIs are called serially.
//
// When you create a new transaction, it is absolutely essential to call
// Discard(). This should be done irrespective of what the update param is set
// to. Commit API internally runs Discard, but running it twice wouldn't cause
// any issues.
//
//	txn := db.NewTransaction(false)
//	defer txn.Discard()
//	// Call various APIs.
func (db *DB) NewTransaction(update bool) *Txn {
	if db.opt.ReadOnly && update {
		// DB is read-only, force read-only transaction.
		update = false
	}

	txn := &Txn{
		update: update,
		db:     db,
		readTs: db.orc.readTs(),
		count:  1,                       // One extra entry for BitFin.
		size:   int64(len(txnKey) + 10), // Some buffer for the extra entry.
	}
	if update {
		txn.pendingWrites = make(map[string]*Entry)
		txn.db.orc.addRef()
	}
	return txn
}

// View executes a function creating and managing a read-only transaction for the user. Error
// returned by the function is relayed by the View method.
func (db *DB) View(fn func(txn *Txn) error) error {
	if db.opt.managedTxns {
		return ErrManagedTxn
	}
	txn := db.NewTransaction(false)
	defer txn.Discard()

	return fn(txn)
}

// Update executes a function, creating and managing a read-write transaction
// for the user. Error returned by the function is relayed by the Update method.
func (db *DB) Update(fn func(txn *Txn) error) error {
	if db.opt.managedTxns {
		return ErrManagedTxn
	}
	txn := db.NewTransaction(true)
	defer txn.Discard()

	if err := fn(txn); err != nil {
		return err
	}

	return txn.Commit(nil)
}
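A sketch of typical call sites for the two managed wrappers, assuming a `*badger.DB` opened elsewhere. Update commits on success; View never commits:

```go
package main

import "github.com/dgraph-io/badger"

// roundTrip writes a key in one managed transaction and reads it back in
// another. The key and value are illustrative.
func roundTrip(db *badger.DB) error {
	if err := db.Update(func(txn *badger.Txn) error {
		return txn.Set([]byte("answer"), []byte("42"))
	}); err != nil {
		return err
	}
	return db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte("answer"))
		if err != nil {
			return err // badger.ErrKeyNotFound if absent
		}
		// Item accessors live in a file not shown in full in this diff;
		// here we only assert the key exists.
		_ = item
		return nil
	})
}
```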
@ -0,0 +1,139 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"io/ioutil"
	"math/rand"
	"sync/atomic"
	"time"

	"github.com/dgraph-io/badger/table"
	"github.com/dgraph-io/badger/y"
	"github.com/pkg/errors"
)

// summary is produced when DB is closed. Currently it is used only for testing.
type summary struct {
	fileIDs map[uint64]bool
}

func (s *levelsController) getSummary() *summary {
	out := &summary{
		fileIDs: make(map[uint64]bool),
	}
	for _, l := range s.levels {
		l.getSummary(out)
	}
	return out
}

func (s *levelHandler) getSummary(sum *summary) {
	s.RLock()
	defer s.RUnlock()
	for _, t := range s.tables {
		sum.fileIDs[t.ID()] = true
	}
}

func (s *DB) validate() error { return s.lc.validate() }

func (s *levelsController) validate() error {
	for _, l := range s.levels {
		if err := l.validate(); err != nil {
			return errors.Wrap(err, "Levels Controller")
		}
	}
	return nil
}

// validate does some sanity check on one level of data or in-memory index.
func (s *levelHandler) validate() error {
	if s.level == 0 {
		return nil
	}

	s.RLock()
	defer s.RUnlock()
	numTables := len(s.tables)
	for j := 1; j < numTables; j++ {
		if j >= len(s.tables) {
			return errors.Errorf("Level %d, j=%d numTables=%d", s.level, j, numTables)
		}

		if y.CompareKeys(s.tables[j-1].Biggest(), s.tables[j].Smallest()) >= 0 {
			return errors.Errorf(
				"Inter: %q vs %q: level=%d j=%d numTables=%d",
				string(s.tables[j-1].Biggest()), string(s.tables[j].Smallest()), s.level, j, numTables)
		}

		if y.CompareKeys(s.tables[j].Smallest(), s.tables[j].Biggest()) > 0 {
			return errors.Errorf(
				"Intra: %q vs %q: level=%d j=%d numTables=%d",
				s.tables[j].Smallest(), s.tables[j].Biggest(), s.level, j, numTables)
		}
	}
	return nil
}

// func (s *KV) debugPrintMore() { s.lc.debugPrintMore() }

// // debugPrintMore shows key ranges of each level.
// func (s *levelsController) debugPrintMore() {
// 	s.Lock()
// 	defer s.Unlock()
// 	for i := 0; i < s.kv.opt.MaxLevels; i++ {
// 		s.levels[i].debugPrintMore()
// 	}
// }

// func (s *levelHandler) debugPrintMore() {
// 	s.RLock()
// 	defer s.RUnlock()
// 	s.elog.Printf("Level %d:", s.level)
// 	for _, t := range s.tables {
// 		y.Printf(" [%s, %s]", t.Smallest(), t.Biggest())
// 	}
// 	y.Printf("\n")
// }

// reserveFileID reserves a unique file id.
func (s *levelsController) reserveFileID() uint64 {
	id := atomic.AddUint64(&s.nextFileID, 1)
	return id - 1
}

func getIDMap(dir string) map[uint64]struct{} {
	fileInfos, err := ioutil.ReadDir(dir)
	y.Check(err)
	idMap := make(map[uint64]struct{})
	for _, info := range fileInfos {
		if info.IsDir() {
			continue
		}
		fileID, ok := table.ParseFileID(info.Name())
		if !ok {
			continue
		}
		idMap[fileID] = struct{}{}
	}
	return idMap
}

func init() {
	rand.Seed(time.Now().UnixNano())
}
File diff suppressed because it is too large
@ -0,0 +1,83 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

// This file contains some functions for error handling. Note that we are moving
// towards using x.Trace, i.e., rpc tracing using net/tracer. But for now, these
// functions are useful for simple checks logged on one machine.
// Some common use cases are:
// (1) You receive an error from external lib, and would like to check/log fatal.
//     For this, use x.Check, x.Checkf. These will check for err != nil, which is
//     more common in Go. If you want to check for boolean being true, use
//     x.Assert, x.Assertf.
// (2) You receive an error from external lib, and would like to pass on with some
//     stack trace information. In this case, use x.Wrap or x.Wrapf.
// (3) You want to generate a new error with stack trace info. Use x.Errorf.

import (
	"fmt"
	"log"

	"github.com/pkg/errors"
)

var debugMode = true

// Check logs fatal if err != nil.
func Check(err error) {
	if err != nil {
		log.Fatalf("%+v", Wrap(err))
	}
}

// Check2 acts as convenience wrapper around Check, using the 2nd argument as error.
func Check2(_ interface{}, err error) {
	Check(err)
}

// AssertTrue asserts that b is true. Otherwise, it would log fatal.
func AssertTrue(b bool) {
	if !b {
		log.Fatalf("%+v", errors.Errorf("Assert failed"))
	}
}

// AssertTruef is AssertTrue with extra info.
func AssertTruef(b bool, format string, args ...interface{}) {
	if !b {
		log.Fatalf("%+v", errors.Errorf(format, args...))
	}
}

// Wrap wraps errors from external lib.
func Wrap(err error) error {
	if !debugMode {
		return err
	}
	return errors.Wrap(err, "")
}

// Wrapf is Wrap with extra info.
func Wrapf(err error, format string, args ...interface{}) error {
	if !debugMode {
		if err == nil {
			return nil
		}
		return fmt.Errorf(format+" error: %+v", append(args, err)...)
	}
	return errors.Wrapf(err, format, args...)
}
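A short sketch of how callers are meant to use these helpers; the file name and offset are illustrative. `y.Check` crashes with a stack trace on any non-nil error, while `y.Wrapf` annotates and propagates:

```go
package main

import (
	"os"

	"github.com/dgraph-io/badger/y"
)

// openOrDie uses the fatal style: any error kills the process with a trace.
func openOrDie(path string) *os.File {
	f, err := os.Open(path)
	y.Check(err)
	return f
}

// seekTo uses the propagating style: the error is annotated and returned.
func seekTo(f *os.File, offset int64) error {
	if _, err := f.Seek(offset, 0); err != nil {
		return y.Wrapf(err, "while seeking to %d in %s", offset, f.Name())
	}
	return nil
}
```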
@ -0,0 +1,25 @@
// +build !dragonfly,!freebsd,!windows

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import "golang.org/x/sys/unix"

func init() {
	datasyncFileFlag = unix.O_DSYNC
}
@ -0,0 +1,25 @@
// +build dragonfly freebsd windows

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import "syscall"

func init() {
	datasyncFileFlag = syscall.O_SYNC
}
@ -0,0 +1,264 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import (
	"bytes"
	"container/heap"
	"encoding/binary"

	"github.com/pkg/errors"
)

// ValueStruct represents the value info that can be associated with a key, but also the internal
// Meta field.
type ValueStruct struct {
	Meta      byte
	UserMeta  byte
	ExpiresAt uint64
	Value     []byte

	Version uint64 // This field is not serialized. Only for internal usage.
}

func sizeVarint(x uint64) (n int) {
	for {
		n++
		x >>= 7
		if x == 0 {
			break
		}
	}
	return n
}

// EncodedSize is the size of the ValueStruct when encoded
func (v *ValueStruct) EncodedSize() uint16 {
	sz := len(v.Value) + 2 // meta, usermeta.
	if v.ExpiresAt == 0 {
		return uint16(sz + 1)
	}

	enc := sizeVarint(v.ExpiresAt)
	return uint16(sz + enc)
}

// Decode uses the length of the slice to infer the length of the Value field.
func (v *ValueStruct) Decode(b []byte) {
	v.Meta = b[0]
	v.UserMeta = b[1]
	var sz int
	v.ExpiresAt, sz = binary.Uvarint(b[2:])
	v.Value = b[2+sz:]
}

// Encode expects a slice of length at least v.EncodedSize().
func (v *ValueStruct) Encode(b []byte) {
	b[0] = v.Meta
	b[1] = v.UserMeta
	sz := binary.PutUvarint(b[2:], v.ExpiresAt)
	copy(b[2+sz:], v.Value)
}

// EncodeTo should be kept in sync with the Encode function above. The reason
// this function exists is to avoid creating byte arrays per key-value pair in
// table/builder.go.
func (v *ValueStruct) EncodeTo(buf *bytes.Buffer) {
	buf.WriteByte(v.Meta)
	buf.WriteByte(v.UserMeta)
	var enc [binary.MaxVarintLen64]byte
	sz := binary.PutUvarint(enc[:], v.ExpiresAt)
	buf.Write(enc[:sz])
	buf.Write(v.Value)
}

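A round-trip sketch of the encoding functions above, written as an external caller of the `y` package:

```go
package main

import (
	"fmt"

	"github.com/dgraph-io/badger/y"
)

func main() {
	// Encode into a buffer sized by EncodedSize, then decode it back.
	v := y.ValueStruct{Meta: 1, UserMeta: 2, ExpiresAt: 1234, Value: []byte("hello")}
	buf := make([]byte, v.EncodedSize())
	v.Encode(buf)

	var out y.ValueStruct
	out.Decode(buf)
	// Version is deliberately not serialized, so out.Version stays 0.
	fmt.Printf("%+v\n", out)
}
```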
// Iterator is an interface for a basic iterator.
type Iterator interface {
	Next()
	Rewind()
	Seek(key []byte)
	Key() []byte
	Value() ValueStruct
	Valid() bool

	// All iterators should be closed so that file garbage collection works.
	Close() error
}

type elem struct {
	itr      Iterator
	nice     int
	reversed bool
}

type elemHeap []*elem

func (eh elemHeap) Len() int            { return len(eh) }
func (eh elemHeap) Swap(i, j int)       { eh[i], eh[j] = eh[j], eh[i] }
func (eh *elemHeap) Push(x interface{}) { *eh = append(*eh, x.(*elem)) }
func (eh *elemHeap) Pop() interface{} {
	// Remove the last element, because Go has already swapped 0th elem <-> last.
	old := *eh
	n := len(old)
	x := old[n-1]
	*eh = old[0 : n-1]
	return x
}
func (eh elemHeap) Less(i, j int) bool {
	cmp := CompareKeys(eh[i].itr.Key(), eh[j].itr.Key())
	if cmp < 0 {
		return !eh[i].reversed
	}
	if cmp > 0 {
		return eh[i].reversed
	}
	// The keys are equal. In this case, the lower nice value takes precedence. This is important.
	return eh[i].nice < eh[j].nice
}

// MergeIterator merges multiple iterators.
// NOTE: MergeIterator owns the array of iterators and is responsible for closing them.
type MergeIterator struct {
	h        elemHeap
	curKey   []byte
	reversed bool

	all []Iterator
}

// NewMergeIterator returns a new MergeIterator from a list of Iterators.
func NewMergeIterator(iters []Iterator, reversed bool) *MergeIterator {
	m := &MergeIterator{all: iters, reversed: reversed}
	m.h = make(elemHeap, 0, len(iters))
	m.initHeap()
	return m
}

func (s *MergeIterator) storeKey(smallest Iterator) {
	if cap(s.curKey) < len(smallest.Key()) {
		s.curKey = make([]byte, 2*len(smallest.Key()))
	}
	s.curKey = s.curKey[:len(smallest.Key())]
	copy(s.curKey, smallest.Key())
}

// initHeap checks all iterators and initializes our heap and array of keys.
// Whenever we reverse direction, we need to run this.
func (s *MergeIterator) initHeap() {
	s.h = s.h[:0]
	for idx, itr := range s.all {
		if !itr.Valid() {
			continue
		}
		e := &elem{itr: itr, nice: idx, reversed: s.reversed}
		s.h = append(s.h, e)
	}
	heap.Init(&s.h)
	for len(s.h) > 0 {
		it := s.h[0].itr
		if it == nil || !it.Valid() {
			heap.Pop(&s.h)
			continue
		}
		s.storeKey(s.h[0].itr)
		break
	}
}

// Valid returns whether the MergeIterator is at a valid element.
func (s *MergeIterator) Valid() bool {
	if s == nil {
		return false
	}
	if len(s.h) == 0 {
		return false
	}
	return s.h[0].itr.Valid()
}

// Key returns the key associated with the current iterator
func (s *MergeIterator) Key() []byte {
	if len(s.h) == 0 {
		return nil
	}
	return s.h[0].itr.Key()
}

// Value returns the value associated with the iterator.
func (s *MergeIterator) Value() ValueStruct {
	if len(s.h) == 0 {
		return ValueStruct{}
	}
	return s.h[0].itr.Value()
}

// Next returns the next element. If it is the same as the current key, ignore it.
func (s *MergeIterator) Next() {
	if len(s.h) == 0 {
		return
	}

	smallest := s.h[0].itr
	smallest.Next()

	for len(s.h) > 0 {
		smallest = s.h[0].itr
		if !smallest.Valid() {
			heap.Pop(&s.h)
			continue
		}

		heap.Fix(&s.h, 0)
		smallest = s.h[0].itr
		if smallest.Valid() {
			if !bytes.Equal(smallest.Key(), s.curKey) {
				break
			}
			smallest.Next()
		}
	}
	if !smallest.Valid() {
		return
	}
	s.storeKey(smallest)
}

// Rewind seeks to first element (or last element for reverse iterator).
func (s *MergeIterator) Rewind() {
	for _, itr := range s.all {
		itr.Rewind()
	}
	s.initHeap()
}

// Seek brings us to element with key >= given key.
func (s *MergeIterator) Seek(key []byte) {
	for _, itr := range s.all {
		itr.Seek(key)
	}
	s.initHeap()
}

// Close implements y.Iterator
func (s *MergeIterator) Close() error {
	for _, itr := range s.all {
		if err := itr.Close(); err != nil {
			return errors.Wrap(err, "MergeIterator")
		}
	}
	return nil
}
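To make the merging behaviour concrete, here is a self-contained sketch using a toy slice-backed iterator. `sliceIterator` and its fixture keys are inventions for this example, not part of badger; note that keys must carry the 8-byte version suffix, hence `y.KeyWithTs`:

```go
package main

import (
	"fmt"
	"sort"

	"github.com/dgraph-io/badger/y"
)

// sliceIterator is a toy y.Iterator over pre-sorted, timestamp-suffixed keys.
type sliceIterator struct {
	keys [][]byte
	idx  int
}

func (s *sliceIterator) Next()                { s.idx++ }
func (s *sliceIterator) Rewind()              { s.idx = 0 }
func (s *sliceIterator) Valid() bool          { return s.idx < len(s.keys) }
func (s *sliceIterator) Key() []byte          { return s.keys[s.idx] }
func (s *sliceIterator) Value() y.ValueStruct { return y.ValueStruct{} }
func (s *sliceIterator) Close() error         { return nil }
func (s *sliceIterator) Seek(key []byte) {
	s.idx = sort.Search(len(s.keys), func(i int) bool {
		return y.CompareKeys(s.keys[i], key) >= 0
	})
}

// keys suffixes each raw key with a fixed version timestamp.
func keys(raw ...string) [][]byte {
	out := make([][]byte, 0, len(raw))
	for _, r := range raw {
		out = append(out, y.KeyWithTs([]byte(r), 1))
	}
	return out
}

func main() {
	a := &sliceIterator{keys: keys("a", "c")}
	b := &sliceIterator{keys: keys("b", "c", "d")}
	it := y.NewMergeIterator([]y.Iterator{a, b}, false)
	defer it.Close()
	for it.Rewind(); it.Valid(); it.Next() {
		// Prints a, b, c, d: the duplicate "c" is emitted only once.
		fmt.Printf("%s\n", y.ParseKey(it.Key()))
	}
}
```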
@ -0,0 +1,68 @@
/*
 * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import "expvar"

var (
	// LSMSize has size of the LSM in bytes
	LSMSize *expvar.Map
	// VlogSize has size of the value log in bytes
	VlogSize *expvar.Map
	// PendingWrites tracks the number of pending writes.
	PendingWrites *expvar.Map

	// These are cumulative

	// NumReads has cumulative number of reads
	NumReads *expvar.Int
	// NumWrites has cumulative number of writes
	NumWrites *expvar.Int
	// NumBytesRead has cumulative number of bytes read
	NumBytesRead *expvar.Int
	// NumBytesWritten has cumulative number of bytes written
	NumBytesWritten *expvar.Int
	// NumLSMGets is number of LSM gets
	NumLSMGets *expvar.Map
	// NumLSMBloomHits is number of LSM bloom hits
	NumLSMBloomHits *expvar.Map
	// NumGets is number of gets
	NumGets *expvar.Int
	// NumPuts is number of puts
	NumPuts *expvar.Int
	// NumBlockedPuts is number of blocked puts
	NumBlockedPuts *expvar.Int
	// NumMemtableGets is number of memtable gets
	NumMemtableGets *expvar.Int
)

// These variables are global and have cumulative values for all kv stores.
func init() {
	NumReads = expvar.NewInt("badger_disk_reads_total")
	NumWrites = expvar.NewInt("badger_disk_writes_total")
	NumBytesRead = expvar.NewInt("badger_read_bytes")
	NumBytesWritten = expvar.NewInt("badger_written_bytes")
	NumLSMGets = expvar.NewMap("badger_lsm_level_gets_total")
	NumLSMBloomHits = expvar.NewMap("badger_lsm_bloom_hits_total")
	NumGets = expvar.NewInt("badger_gets_total")
	NumPuts = expvar.NewInt("badger_puts_total")
	NumBlockedPuts = expvar.NewInt("badger_blocked_puts_total")
	NumMemtableGets = expvar.NewInt("badger_memtable_gets_total")
	LSMSize = expvar.NewMap("badger_lsm_size_bytes")
	VlogSize = expvar.NewMap("badger_vlog_size_bytes")
	PendingWrites = expvar.NewMap("badger_pending_writes_total")
}
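Because these counters live in the standard `expvar` registry, any process importing this package can publish them over HTTP with the stock handler. A minimal sketch; the port and blank imports are assumptions for illustration:

```go
package main

import (
	_ "expvar" // registers GET /debug/vars on http.DefaultServeMux
	"log"
	"net/http"

	_ "github.com/dgraph-io/badger/y" // its init() registers the badger_* vars above
)

func main() {
	// curl http://localhost:8080/debug/vars | grep badger_
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```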
@ -0,0 +1,63 @@
// +build !windows

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import (
	"os"
	"syscall"
	"unsafe"

	"golang.org/x/sys/unix"
)

// Mmap uses the mmap system call to memory-map a file. If writable is true,
// memory protection of the pages is set so that they may be written to as well.
func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) {
	mtype := unix.PROT_READ
	if writable {
		mtype |= unix.PROT_WRITE
	}
	return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED)
}

// Munmap unmaps a previously mapped slice.
func Munmap(b []byte) error {
	return unix.Munmap(b)
}

// Madvise uses the madvise system call to give advice about the use of memory
// when using a slice that is memory-mapped to a file. Set the readahead flag to
// false if page references are expected in random order.
func Madvise(b []byte, readahead bool) error {
	flags := unix.MADV_NORMAL
	if !readahead {
		flags = unix.MADV_RANDOM
	}
	return madvise(b, flags)
}

// This is required because the unix package does not support the madvise system call on OS X.
func madvise(b []byte, advice int) (err error) {
	_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])),
		uintptr(len(b)), uintptr(advice))
	if e1 != 0 {
		err = e1
	}
	return
}
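A usage sketch, under the assumption of an existing, non-empty file; `vlog.0000` is an illustrative name:

```go
package main

import (
	"os"

	"github.com/dgraph-io/badger/y"
)

func main() {
	f, err := os.Open("vlog.0000")
	y.Check(err)
	defer f.Close()

	fi, err := f.Stat()
	y.Check(err)

	buf, err := y.Mmap(f, false, fi.Size()) // read-only mapping
	y.Check(err)
	y.Check(y.Madvise(buf, false)) // pages will be touched in random order

	_ = buf[0] // reads go straight through the mapping
	y.Check(y.Munmap(buf))
}
```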
@ -0,0 +1,90 @@
// +build windows

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import (
	"fmt"
	"os"
	"syscall"
	"unsafe"
)

func Mmap(fd *os.File, write bool, size int64) ([]byte, error) {
	protect := syscall.PAGE_READONLY
	access := syscall.FILE_MAP_READ

	if write {
		protect = syscall.PAGE_READWRITE
		access = syscall.FILE_MAP_WRITE
	}
	fi, err := fd.Stat()
	if err != nil {
		return nil, err
	}

	// Truncate the database to the size of the mmap.
	if fi.Size() < size {
		if err := fd.Truncate(size); err != nil {
			return nil, fmt.Errorf("truncate: %s", err)
		}
	}

	// Open a file mapping handle.
	sizelo := uint32(size >> 32)
	sizehi := uint32(size) & 0xffffffff

	handler, err := syscall.CreateFileMapping(syscall.Handle(fd.Fd()), nil,
		uint32(protect), sizelo, sizehi, nil)
	if err != nil {
		return nil, os.NewSyscallError("CreateFileMapping", err)
	}

	// Create the memory map.
	addr, err := syscall.MapViewOfFile(handler, uint32(access), 0, 0, uintptr(size))
	if addr == 0 {
		return nil, os.NewSyscallError("MapViewOfFile", err)
	}

	// Close mapping handle.
	if err := syscall.CloseHandle(syscall.Handle(handler)); err != nil {
		return nil, os.NewSyscallError("CloseHandle", err)
	}

	// Slice memory layout
	// Copied this snippet from golang/sys package
	var sl = struct {
		addr uintptr
		len  int
		cap  int
	}{addr, int(size), int(size)}

	// Use unsafe to turn sl into a []byte.
	data := *(*[]byte)(unsafe.Pointer(&sl))

	return data, nil
}

func Munmap(b []byte) error {
	return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&b[0])))
}

func Madvise(b []byte, readahead bool) error {
	// Do nothing. We don’t care about this setting on Windows.
	return nil
}
@ -0,0 +1,209 @@
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package y

import (
	"bytes"
	"encoding/binary"
	"hash/crc32"
	"math"
	"os"
	"sync"

	"github.com/pkg/errors"
)

// ErrEOF indicates an end of file when trying to read from a memory mapped file
// and encountering the end of slice.
var ErrEOF = errors.New("End of mapped region")

const (
	// Sync indicates that O_DSYNC should be set on the underlying file,
	// ensuring that data writes do not return until the data is flushed
	// to disk.
	Sync = 1 << iota
	// ReadOnly opens the underlying file on a read-only basis.
	ReadOnly
)

var (
	// This is O_DSYNC (datasync) on platforms that support it -- see file_unix.go
	datasyncFileFlag = 0x0

	// CastagnoliCrcTable is a CRC32 polynomial table
	CastagnoliCrcTable = crc32.MakeTable(crc32.Castagnoli)
)

// OpenExistingFile opens an existing file, errors if it doesn't exist.
func OpenExistingFile(filename string, flags uint32) (*os.File, error) {
	openFlags := os.O_RDWR
	if flags&ReadOnly != 0 {
		openFlags = os.O_RDONLY
	}

	if flags&Sync != 0 {
		openFlags |= datasyncFileFlag
	}
	return os.OpenFile(filename, openFlags, 0)
}

// CreateSyncedFile creates a new file (using O_EXCL), errors if it already existed.
func CreateSyncedFile(filename string, sync bool) (*os.File, error) {
	flags := os.O_RDWR | os.O_CREATE | os.O_EXCL
	if sync {
		flags |= datasyncFileFlag
	}
	return os.OpenFile(filename, flags, 0666)
}

// OpenSyncedFile creates the file if one doesn't exist.
func OpenSyncedFile(filename string, sync bool) (*os.File, error) {
	flags := os.O_RDWR | os.O_CREATE
	if sync {
		flags |= datasyncFileFlag
	}
	return os.OpenFile(filename, flags, 0666)
}

// OpenTruncFile opens the file with O_RDWR | O_CREATE | O_TRUNC
func OpenTruncFile(filename string, sync bool) (*os.File, error) {
	flags := os.O_RDWR | os.O_CREATE | os.O_TRUNC
	if sync {
		flags |= datasyncFileFlag
	}
	return os.OpenFile(filename, flags, 0666)
}

// SafeCopy does append(a[:0], src...).
func SafeCopy(a []byte, src []byte) []byte {
	return append(a[:0], src...)
}

// Copy copies a byte slice and returns the copied slice.
func Copy(a []byte) []byte {
	b := make([]byte, len(a))
	copy(b, a)
	return b
}

// KeyWithTs generates a new key by appending ts to key.
func KeyWithTs(key []byte, ts uint64) []byte {
	out := make([]byte, len(key)+8)
	copy(out, key)
	binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts)
	return out
}

// ParseTs parses the timestamp from the key bytes.
func ParseTs(key []byte) uint64 {
	if len(key) <= 8 {
		return 0
	}
	return math.MaxUint64 - binary.BigEndian.Uint64(key[len(key)-8:])
}

// CompareKeys compares the keys without their timestamps, and compares the
// timestamps only if the raw keys are equal. This matters because with plain
// bytes.Compare, a<timestamp> could sort higher than aa<timestamp>.
// All keys must carry a timestamp suffix.
func CompareKeys(key1 []byte, key2 []byte) int {
	AssertTrue(len(key1) > 8 && len(key2) > 8)
	if cmp := bytes.Compare(key1[:len(key1)-8], key2[:len(key2)-8]); cmp != 0 {
		return cmp
	}
	return bytes.Compare(key1[len(key1)-8:], key2[len(key2)-8:])
}

// ParseKey parses the actual key from the key bytes.
func ParseKey(key []byte) []byte {
	if key == nil {
		return nil
	}

	AssertTruef(len(key) > 8, "key=%q", key)
	return key[:len(key)-8]
}

// SameKey checks for key equality ignoring the version timestamp suffix.
func SameKey(src, dst []byte) bool {
	if len(src) != len(dst) {
		return false
	}
	return bytes.Equal(ParseKey(src), ParseKey(dst))
}

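A quick round-trip sketch of the versioned-key helpers above, highlighting the inverted-timestamp ordering:

```go
package main

import (
	"fmt"

	"github.com/dgraph-io/badger/y"
)

func main() {
	k7 := y.KeyWithTs([]byte("user42"), 7)
	k9 := y.KeyWithTs([]byte("user42"), 9)

	fmt.Println(string(y.ParseKey(k7))) // user42
	fmt.Println(y.ParseTs(k7))          // 7
	fmt.Println(y.SameKey(k7, k9))      // true: same key, different versions
	// Because KeyWithTs stores math.MaxUint64-ts, the *newer* version sorts first.
	fmt.Println(y.CompareKeys(k9, k7) < 0) // true
}
```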
// Slice holds a reusable buf, will reallocate if you request a larger size than ever before.
// One problem is with n distinct sizes in random order it'll reallocate log(n) times.
type Slice struct {
	buf []byte
}

// Resize reuses the Slice's buffer (or makes a new one) and returns a slice in that buffer of
// length sz.
func (s *Slice) Resize(sz int) []byte {
	if cap(s.buf) < sz {
		s.buf = make([]byte, sz)
	}
	return s.buf[0:sz]
}

// Closer holds the two things we need to close a goroutine and wait for it to finish: a chan
// to tell the goroutine to shut down, and a WaitGroup with which to wait for it to finish shutting
// down.
type Closer struct {
	closed  chan struct{}
	waiting sync.WaitGroup
}

// NewCloser constructs a new Closer, with an initial count on the WaitGroup.
func NewCloser(initial int) *Closer {
	ret := &Closer{closed: make(chan struct{})}
	ret.waiting.Add(initial)
	return ret
}

// AddRunning Add()'s delta to the WaitGroup.
func (lc *Closer) AddRunning(delta int) {
	lc.waiting.Add(delta)
}

// Signal signals the HasBeenClosed signal.
func (lc *Closer) Signal() {
	close(lc.closed)
}

// HasBeenClosed gets signaled when Signal() is called.
func (lc *Closer) HasBeenClosed() <-chan struct{} {
	return lc.closed
}

// Done calls Done() on the WaitGroup.
func (lc *Closer) Done() {
	lc.waiting.Done()
}

// Wait waits on the WaitGroup. (It waits for NewCloser's initial value, AddRunning, and Done
// calls to balance out.)
func (lc *Closer) Wait() {
	lc.waiting.Wait()
}

// SignalAndWait calls Signal(), then Wait().
func (lc *Closer) SignalAndWait() {
	lc.Signal()
	lc.Wait()
}
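The intended Closer lifecycle, sketched for a single background worker; the ticking interval is illustrative:

```go
package main

import (
	"time"

	"github.com/dgraph-io/badger/y"
)

func main() {
	c := y.NewCloser(1) // one goroutine to wait for

	go func() {
		defer c.Done()
		for {
			select {
			case <-c.HasBeenClosed():
				return // shutdown requested
			case <-time.After(100 * time.Millisecond):
				// periodic work goes here
			}
		}
	}()

	// During shutdown: ask the worker to stop, then block until Done().
	c.SignalAndWait()
}
```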
@ -0,0 +1,23 @@
As this is a highly derivative work, I have placed it under the same license as the original implementation:

Copyright (c) 2014-2017 Damian Gryski
Copyright (c) 2016-2017 Nicola Asuni - Tecnick.com

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
@ -0,0 +1,203 @@
# MAKEFILE
#
# @author      Nicola Asuni <info@tecnick.com>
# @link        https://github.com/dgryski/go-farm
#
# This file is intended to be executed in a Linux-compatible system.
# It also assumes that the project has been cloned in the right path under GOPATH:
# $GOPATH/src/github.com/dgryski/go-farm
#
# ------------------------------------------------------------------------------

# List special make targets that are not associated with files
.PHONY: help all test format fmtcheck vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan qa deps clean nuke

# Use bash as shell (Note: Ubuntu now uses dash which doesn't support PIPESTATUS).
SHELL=/bin/bash

# CVS path (path to the parent dir containing the project)
CVSPATH=github.com/dgryski

# Project owner
OWNER=dgryski

# Project vendor
VENDOR=dgryski

# Project name
PROJECT=go-farm

# Project version
VERSION=$(shell cat VERSION)

# Name of RPM or DEB package
PKGNAME=${VENDOR}-${PROJECT}

# Current directory
CURRENTDIR=$(shell pwd)

# GO lang path
ifneq ($(GOPATH),)
	ifeq ($(findstring $(GOPATH),$(CURRENTDIR)),)
		# the defined GOPATH is not valid
		GOPATH=
	endif
endif
ifeq ($(GOPATH),)
	# extract the GOPATH
	GOPATH=$(firstword $(subst /src/, ,$(CURRENTDIR)))
endif

# --- MAKE TARGETS ---

# Display general help about this command
help:
	@echo ""
	@echo "$(PROJECT) Makefile."
	@echo "GOPATH=$(GOPATH)"
	@echo "The following commands are available:"
	@echo ""
	@echo "    make qa          : Run all the tests"
	@echo "    make test        : Run the unit tests"
	@echo ""
	@echo "    make format      : Format the source code"
	@echo "    make fmtcheck    : Check if the source code has been formatted"
	@echo "    make vet         : Check for suspicious constructs"
	@echo "    make lint        : Check for style errors"
	@echo "    make coverage    : Generate the coverage report"
	@echo "    make cyclo       : Generate the cyclomatic complexity report"
	@echo "    make ineffassign : Detect ineffectual assignments"
	@echo "    make misspell    : Detect commonly misspelled words in source files"
	@echo "    make structcheck : Find unused struct fields"
	@echo "    make varcheck    : Find unused global variables and constants"
	@echo "    make errcheck    : Check that error return values are used"
	@echo "    make gosimple    : Suggest code simplifications"
	@echo "    make astscan     : GO AST scanner"
	@echo ""
	@echo "    make docs        : Generate source code documentation"
	@echo ""
	@echo "    make deps        : Get the dependencies"
	@echo "    make clean       : Remove any build artifact"
	@echo "    make nuke        : Deletes any intermediate file"
	@echo ""

# Alias for help target
all: help

# Run the unit tests
test:
	@mkdir -p target/test
	@mkdir -p target/report
	GOPATH=$(GOPATH) \
	go test \
	-covermode=atomic \
	-bench=. \
	-race \
	-cpuprofile=target/report/cpu.out \
	-memprofile=target/report/mem.out \
	-mutexprofile=target/report/mutex.out \
	-coverprofile=target/report/coverage.out \
	-v ./... | \
	tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); \
	test $${PIPESTATUS[0]} -eq 0

# Format the source code
format:
	@find . -type f -name "*.go" -exec gofmt -s -w {} \;

# Check if the source code has been formatted
fmtcheck:
	@mkdir -p target
	@find . -type f -name "*.go" -exec gofmt -s -d {} \; | tee target/format.diff
	@test ! -s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; }

# Check for suspicious constructs
vet:
	GOPATH=$(GOPATH) go vet .

# Check for style errors
lint:
	GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint .

# Generate the coverage report
coverage:
	@mkdir -p target/report
	GOPATH=$(GOPATH) \
	go tool cover -html=target/report/coverage.out -o target/report/coverage.html

# Report cyclomatic complexity
cyclo:
	@mkdir -p target/report
	GOPATH=$(GOPATH) gocyclo -avg ./ | tee target/report/cyclo.txt ; test $${PIPESTATUS[0]} -eq 0

# Detect ineffectual assignments
ineffassign:
	@mkdir -p target/report
	GOPATH=$(GOPATH) ineffassign ./ | tee target/report/ineffassign.txt ; test $${PIPESTATUS[0]} -eq 0

# Detect commonly misspelled words in source files
misspell:
	@mkdir -p target/report
	GOPATH=$(GOPATH) misspell -error ./ | tee target/report/misspell.txt ; test $${PIPESTATUS[0]} -eq 0

# Find unused struct fields
structcheck:
	@mkdir -p target/report
	GOPATH=$(GOPATH) structcheck -a ./ | tee target/report/structcheck.txt

# Find unused global variables and constants
varcheck:
	@mkdir -p target/report
	GOPATH=$(GOPATH) varcheck -e ./ | tee target/report/varcheck.txt

# Check that error return values are used
errcheck:
	@mkdir -p target/report
	GOPATH=$(GOPATH) errcheck ./ | tee target/report/errcheck.txt

# Suggest code simplifications
gosimple:
	@mkdir -p target/report
	GOPATH=$(GOPATH) gosimple ./ | tee target/report/gosimple.txt

# AST scanner
astscan:
	@mkdir -p target/report
	GOPATH=$(GOPATH) gas .//*.go | tee target/report/astscan.txt

# Generate source docs
docs:
	@mkdir -p target/docs
	nohup sh -c 'GOPATH=$(GOPATH) godoc -http=127.0.0.1:6060' > target/godoc_server.log 2>&1 &
	wget --directory-prefix=target/docs/ --execute robots=off --retry-connrefused --recursive --no-parent --adjust-extension --page-requisites --convert-links http://127.0.0.1:6060/pkg/github.com/${VENDOR}/${PROJECT}/ ; kill -9 `lsof -ti :6060`
	@echo '<html><head><meta http-equiv="refresh" content="0;./127.0.0.1:6060/pkg/'${CVSPATH}'/'${PROJECT}'/index.html"/></head><a href="./127.0.0.1:6060/pkg/'${CVSPATH}'/'${PROJECT}'/index.html">'${PKGNAME}' Documentation ...</a></html>' > target/docs/index.html

# Alias to run all quality-assurance checks
qa: fmtcheck test vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan

# --- INSTALL ---

# Get the dependencies
deps:
	GOPATH=$(GOPATH) go get ./...
	GOPATH=$(GOPATH) go get github.com/golang/lint/golint
	GOPATH=$(GOPATH) go get github.com/jstemmer/go-junit-report
	GOPATH=$(GOPATH) go get github.com/axw/gocov/gocov
	GOPATH=$(GOPATH) go get github.com/fzipp/gocyclo
	GOPATH=$(GOPATH) go get github.com/gordonklaus/ineffassign
	GOPATH=$(GOPATH) go get github.com/client9/misspell/cmd/misspell
	GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/structcheck
	GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/varcheck
	GOPATH=$(GOPATH) go get github.com/kisielk/errcheck
	GOPATH=$(GOPATH) go get honnef.co/go/tools/cmd/gosimple
	GOPATH=$(GOPATH) go get github.com/GoASTScanner/gas

# Remove any build artifact
clean:
	GOPATH=$(GOPATH) go clean ./...

# Deletes any intermediate file
nuke:
	rm -rf ./target
	GOPATH=$(GOPATH) go clean -i ./...
@ -0,0 +1,41 @@
# go-farm

*Google's FarmHash hash functions implemented in Go*

[![Master Branch](https://img.shields.io/badge/-master:-gray.svg)](https://github.com/dgryski/go-farm/tree/master)
[![Master Build Status](https://secure.travis-ci.org/dgryski/go-farm.png?branch=master)](https://travis-ci.org/dgryski/go-farm?branch=master)
[![Master Coverage Status](https://coveralls.io/repos/dgryski/go-farm/badge.svg?branch=master&service=github)](https://coveralls.io/github/dgryski/go-farm?branch=master)
[![Go Report Card](https://goreportcard.com/badge/github.com/dgryski/go-farm)](https://goreportcard.com/report/github.com/dgryski/go-farm)
[![GoDoc](https://godoc.org/github.com/dgryski/go-farm?status.svg)](http://godoc.org/github.com/dgryski/go-farm)

## Description

FarmHash, a family of hash functions.

This is a (mechanical) translation of the non-SSE4/non-AESNI hash functions from Google's FarmHash (https://github.com/google/farmhash).

FarmHash provides hash functions for strings and other data.
The functions mix the input bits thoroughly but are not suitable for cryptography.

All members of the FarmHash family were designed with heavy reliance on previous work by Jyrki Alakuijala, Austin Appleby, Bob Jenkins, and others.

For more information please consult https://github.com/google/farmhash
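The vendored badger code earlier in this commit calls `farm.Fingerprint64` to fingerprint transaction keys for conflict detection; a minimal sketch of that entry point:

```go
package main

import (
	"fmt"

	farm "github.com/dgryski/go-farm"
)

func main() {
	fp := farm.Fingerprint64([]byte("some key"))
	fmt.Printf("%016x\n", fp) // stable 64-bit fingerprint, not cryptographic
}
```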

## Getting started

This application is written in the Go language; please refer to the guides at https://golang.org to get started.

This project includes a Makefile that allows you to test and build the project with simple commands.
To see all available options:
|
||||
```bash
|
||||
make help
|
||||
```
|
||||
|
||||
## Running all tests
|
||||
|
||||
Before committing the code, please check if it passes all tests using
|
||||
```bash
|
||||
make qa
|
||||
```
|
|
@@ -0,0 +1 @@
2.0.1
@@ -0,0 +1,30 @@
package farm

// Some primes between 2^63 and 2^64 for various uses.
const k0 uint64 = 0xc3a5c85c97cb3127
const k1 uint64 = 0xb492b66fbe98f273
const k2 uint64 = 0x9ae16a3b2f90404f

// Magic numbers for 32-bit hashing. Copied from Murmur3.
const c1 uint32 = 0xcc9e2d51
const c2 uint32 = 0x1b873593

// A 32-bit to 32-bit integer hash copied from Murmur3.
func fmix(h uint32) uint32 {
	h ^= h >> 16
	h *= 0x85ebca6b
	h ^= h >> 13
	h *= 0xc2b2ae35
	h ^= h >> 16
	return h
}

// mur is a helper from Murmur3 for combining two 32-bit values.
func mur(a, h uint32) uint32 {
	a *= c1
	a = rotate32(a, 17)
	a *= c2
	h ^= a
	h = rotate32(h, 19)
	return h*5 + 0xe6546b64
}
@@ -0,0 +1,199 @@
package farm

// This file provides a 32-bit hash equivalent to CityHash32 (v1.1.1)
// and a 128-bit hash equivalent to CityHash128 (v1.1.1). It also provides
// a seeded 32-bit hash function similar to CityHash32.

func hash32Len13to24Seed(s []byte, seed uint32) uint32 {
	slen := len(s)
	a := fetch32(s, -4+(slen>>1))
	b := fetch32(s, 4)
	c := fetch32(s, slen-8)
	d := fetch32(s, (slen >> 1))
	e := fetch32(s, 0)
	f := fetch32(s, slen-4)
	h := d*c1 + uint32(slen) + seed
	a = rotate32(a, 12) + f
	h = mur(c, h) + a
	a = rotate32(a, 3) + c
	h = mur(e, h) + a
	a = rotate32(a+f, 12) + d
	h = mur(b^seed, h) + a
	return fmix(h)
}

func hash32Len0to4(s []byte, seed uint32) uint32 {
	slen := len(s)
	b := seed
	c := uint32(9)
	for i := 0; i < slen; i++ {
		v := int8(s[i])
		b = (b * c1) + uint32(v)
		c ^= b
	}
	return fmix(mur(b, mur(uint32(slen), c)))
}

func hash128to64(x uint128) uint64 {
	// Murmur-inspired hashing.
	const mul uint64 = 0x9ddfea08eb382d69
	a := (x.lo ^ x.hi) * mul
	a ^= (a >> 47)
	b := (x.hi ^ a) * mul
	b ^= (b >> 47)
	b *= mul
	return b
}

type uint128 struct {
	lo uint64
	hi uint64
}

// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings
// of any length representable in signed long. Based on City and Murmur.
func cityMurmur(s []byte, seed uint128) uint128 {
	slen := len(s)
	a := seed.lo
	b := seed.hi
	var c uint64
	var d uint64
	l := slen - 16
	if l <= 0 { // len <= 16
		a = shiftMix(a*k1) * k1
		c = b*k1 + hashLen0to16(s)
		if slen >= 8 {
			d = shiftMix(a + fetch64(s, 0))
		} else {
			d = shiftMix(a + c)
		}
	} else { // len > 16
		c = hashLen16(fetch64(s, slen-8)+k1, a)
		d = hashLen16(b+uint64(slen), c+fetch64(s, slen-16))
		a += d
		for {
			a ^= shiftMix(fetch64(s, 0)*k1) * k1
			a *= k1
			b ^= a
			c ^= shiftMix(fetch64(s, 8)*k1) * k1
			c *= k1
			d ^= c
			s = s[16:]
			l -= 16
			if l <= 0 {
				break
			}
		}
	}
	a = hashLen16(a, c)
	b = hashLen16(d, b)
	return uint128{a ^ b, hashLen16(b, a)}
}

func cityHash128WithSeed(s []byte, seed uint128) uint128 {
	slen := len(s)
	if slen < 128 {
		return cityMurmur(s, seed)
	}

	endIdx := ((slen - 1) / 128) * 128
	lastBlockIdx := endIdx + ((slen - 1) & 127) - 127
	last := s[lastBlockIdx:]

	// We expect len >= 128 to be the common case. Keep 56 bytes of state:
	// v, w, x, y, and z.
	var v1, v2 uint64
	var w1, w2 uint64
	x := seed.lo
	y := seed.hi
	z := uint64(slen) * k1
	v1 = rotate64(y^k1, 49)*k1 + fetch64(s, 0)
	v2 = rotate64(v1, 42)*k1 + fetch64(s, 8)
	w1 = rotate64(y+z, 35)*k1 + x
	w2 = rotate64(x+fetch64(s, 88), 53) * k1

	// This is the same inner loop as CityHash64(), manually unrolled.
	for {
		x = rotate64(x+y+v1+fetch64(s, 8), 37) * k1
		y = rotate64(y+v2+fetch64(s, 48), 42) * k1
		x ^= w2
		y += v1 + fetch64(s, 40)
		z = rotate64(z+w1, 33) * k1
		v1, v2 = weakHashLen32WithSeeds(s, v2*k1, x+w1)
		w1, w2 = weakHashLen32WithSeeds(s[32:], z+w2, y+fetch64(s, 16))
		z, x = x, z
		s = s[64:]
		x = rotate64(x+y+v1+fetch64(s, 8), 37) * k1
		y = rotate64(y+v2+fetch64(s, 48), 42) * k1
		x ^= w2
		y += v1 + fetch64(s, 40)
		z = rotate64(z+w1, 33) * k1
		v1, v2 = weakHashLen32WithSeeds(s, v2*k1, x+w1)
		w1, w2 = weakHashLen32WithSeeds(s[32:], z+w2, y+fetch64(s, 16))
		z, x = x, z
		s = s[64:]
		slen -= 128
		if slen < 128 {
			break
		}
	}
	x += rotate64(v1+z, 49) * k0
	y = y*k0 + rotate64(w2, 37)
	z = z*k0 + rotate64(w1, 27)
	w1 *= 9
	v1 *= k0
	// If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
	for tailDone := 0; tailDone < slen; {
		tailDone += 32
		y = rotate64(x+y, 42)*k0 + v2
		w1 += fetch64(last, 128-tailDone+16)
		x = x*k0 + w1
		z += w2 + fetch64(last, 128-tailDone)
		w2 += v1
		v1, v2 = weakHashLen32WithSeeds(last[128-tailDone:], v1+z, v2)
		v1 *= k0
	}

	// At this point our 56 bytes of state should contain more than
	// enough information for a strong 128-bit hash. We use two
	// different 56-byte-to-8-byte hashes to get a 16-byte final result.
	x = hashLen16(x, v1)
	y = hashLen16(y+z, w1)
	return uint128{hashLen16(x+v2, w2) + y,
		hashLen16(x+w2, y+v2)}
}

func cityHash128(s []byte) uint128 {
	slen := len(s)
	if slen >= 16 {
		return cityHash128WithSeed(s[16:], uint128{fetch64(s, 0), fetch64(s, 8) + k0})
	}
	return cityHash128WithSeed(s, uint128{k0, k1})
}

// Fingerprint128 is a 128-bit fingerprint function for byte-slices
func Fingerprint128(s []byte) (lo, hi uint64) {
	h := cityHash128(s)
	return h.lo, h.hi
}

// Fingerprint64 is a 64-bit fingerprint function for byte-slices
func Fingerprint64(s []byte) uint64 {
	return naHash64(s)
}

// Fingerprint32 is a 32-bit fingerprint function for byte-slices
func Fingerprint32(s []byte) uint32 {
	return Hash32(s)
}

// Hash128 is a 128-bit hash function for byte-slices
func Hash128(s []byte) (lo, hi uint64) {
	return Fingerprint128(s)
}

// Hash128WithSeed is a 128-bit hash function for byte-slices and a 128-bit seed
func Hash128WithSeed(s []byte, seed0, seed1 uint64) (lo, hi uint64) {
	h := cityHash128WithSeed(s, uint128{seed0, seed1})
	return h.lo, h.hi
}
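// Usage sketch (illustrative; not part of the upstream file): with this
// package imported as farm, the 128-bit entry points above can be used as
//
//	lo, hi := farm.Fingerprint128(data)          // unseeded, stable fingerprint
//	slo, shi := farm.Hash128WithSeed(data, 1, 2) // seeded variant
//
// for any data []byte.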
@@ -0,0 +1,102 @@
package farm

func hash32Len5to12(s []byte, seed uint32) uint32 {
	slen := len(s)
	a := uint32(len(s))
	b := uint32(len(s) * 5)
	c := uint32(9)
	d := b + seed
	a += fetch32(s, 0)
	b += fetch32(s, slen-4)
	c += fetch32(s, ((slen >> 1) & 4))
	return fmix(seed ^ mur(c, mur(b, mur(a, d))))
}

// Hash32 hashes a byte slice and returns a uint32 hash value
func Hash32(s []byte) uint32 {
	slen := len(s)

	if slen <= 24 {
		if slen <= 12 {
			if slen <= 4 {
				return hash32Len0to4(s, 0)
			}
			return hash32Len5to12(s, 0)
		}
		return hash32Len13to24Seed(s, 0)
	}

	// len > 24
	h := uint32(slen)
	g := c1 * uint32(slen)
	f := g
	a0 := rotate32(fetch32(s, slen-4)*c1, 17) * c2
	a1 := rotate32(fetch32(s, slen-8)*c1, 17) * c2
	a2 := rotate32(fetch32(s, slen-16)*c1, 17) * c2
	a3 := rotate32(fetch32(s, slen-12)*c1, 17) * c2
	a4 := rotate32(fetch32(s, slen-20)*c1, 17) * c2
	h ^= a0
	h = rotate32(h, 19)
	h = h*5 + 0xe6546b64
	h ^= a2
	h = rotate32(h, 19)
	h = h*5 + 0xe6546b64
	g ^= a1
	g = rotate32(g, 19)
	g = g*5 + 0xe6546b64
	g ^= a3
	g = rotate32(g, 19)
	g = g*5 + 0xe6546b64
	f += a4
	f = rotate32(f, 19) + 113
	iters := (slen - 1) / 20
	for {
		a := fetch32(s, 0)
		b := fetch32(s, 4)
		c := fetch32(s, 8)
		d := fetch32(s, 12)
		e := fetch32(s, 16)
		h += a
		g += b
		f += c
		h = mur(d, h) + e
		g = mur(c, g) + a
		f = mur(b+e*c1, f) + d
		f += g
		g += f
		s = s[20:]
		iters--
		if iters == 0 {
			break
		}
	}
	g = rotate32(g, 11) * c1
	g = rotate32(g, 17) * c1
	f = rotate32(f, 11) * c1
	f = rotate32(f, 17) * c1
	h = rotate32(h+g, 19)
	h = h*5 + 0xe6546b64
	h = rotate32(h, 17) * c1
	h = rotate32(h+f, 19)
	h = h*5 + 0xe6546b64
	h = rotate32(h, 17) * c1
	return h
}

// Hash32WithSeed hashes a byte slice and a uint32 seed and returns a uint32 hash value
func Hash32WithSeed(s []byte, seed uint32) uint32 {
	slen := len(s)

	if slen <= 24 {
		if slen >= 13 {
			return hash32Len13to24Seed(s, seed*c1)
		}
		if slen >= 5 {
			return hash32Len5to12(s, seed)
		}
		return hash32Len0to4(s, seed)
	}
	h := hash32Len13to24Seed(s[:24], seed^uint32(slen))
	return mur(Hash32(s[24:])+seed, h)
}
@@ -0,0 +1,156 @@
package farm

func shiftMix(val uint64) uint64 {
	return val ^ (val >> 47)
}

func hashLen16(u, v uint64) uint64 {
	return hash128to64(uint128{u, v})
}

func hashLen16Mul(u, v, mul uint64) uint64 {
	// Murmur-inspired hashing.
	a := (u ^ v) * mul
	a ^= (a >> 47)
	b := (v ^ a) * mul
	b ^= (b >> 47)
	b *= mul
	return b
}

func hashLen0to16(s []byte) uint64 {
	slen := uint64(len(s))
	if slen >= 8 {
		mul := k2 + slen*2
		a := fetch64(s, 0) + k2
		b := fetch64(s, int(slen-8))
		c := rotate64(b, 37)*mul + a
		d := (rotate64(a, 25) + b) * mul
		return hashLen16Mul(c, d, mul)
	}

	if slen >= 4 {
		mul := k2 + slen*2
		a := fetch32(s, 0)
		return hashLen16Mul(slen+(uint64(a)<<3), uint64(fetch32(s, int(slen-4))), mul)
	}
	if slen > 0 {
		a := s[0]
		b := s[slen>>1]
		c := s[slen-1]
		y := uint32(a) + (uint32(b) << 8)
		z := uint32(slen) + (uint32(c) << 2)
		return shiftMix(uint64(y)*k2^uint64(z)*k0) * k2
	}
	return k2
}

// This probably works well for 16-byte strings as well, but it may be overkill
// in that case.
func hashLen17to32(s []byte) uint64 {
	slen := len(s)
	mul := k2 + uint64(slen*2)
	a := fetch64(s, 0) * k1
	b := fetch64(s, 8)
	c := fetch64(s, slen-8) * mul
	d := fetch64(s, slen-16) * k2
	return hashLen16Mul(rotate64(a+b, 43)+rotate64(c, 30)+d, a+rotate64(b+k2, 18)+c, mul)
}

// Return a 16-byte hash for 48 bytes. Quick and dirty.
// Callers do best to use "random-looking" values for a and b.
func weakHashLen32WithSeedsWords(w, x, y, z, a, b uint64) (uint64, uint64) {
	a += w
	b = rotate64(b+a+z, 21)
	c := a
	a += x
	a += y
	b += rotate64(a, 44)
	return a + z, b + c
}

// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty.
func weakHashLen32WithSeeds(s []byte, a, b uint64) (uint64, uint64) {
	return weakHashLen32WithSeedsWords(fetch64(s, 0),
		fetch64(s, 8),
		fetch64(s, 16),
		fetch64(s, 24),
		a,
		b)
}

// Return an 8-byte hash for 33 to 64 bytes.
func hashLen33to64(s []byte) uint64 {
	slen := len(s)
	mul := k2 + uint64(slen)*2
	a := fetch64(s, 0) * k2
	b := fetch64(s, 8)
	c := fetch64(s, slen-8) * mul
	d := fetch64(s, slen-16) * k2
	y := rotate64(a+b, 43) + rotate64(c, 30) + d
	z := hashLen16Mul(y, a+rotate64(b+k2, 18)+c, mul)
	e := fetch64(s, 16) * mul
	f := fetch64(s, 24)
	g := (y + fetch64(s, slen-32)) * mul
	h := (z + fetch64(s, slen-24)) * mul
	return hashLen16Mul(rotate64(e+f, 43)+rotate64(g, 30)+h, e+rotate64(f+a, 18)+g, mul)
}

func naHash64(s []byte) uint64 {
	slen := len(s)
	var seed uint64 = 81
	if slen <= 32 {
		if slen <= 16 {
			return hashLen0to16(s)
		}
		return hashLen17to32(s)
	}
	if slen <= 64 {
		return hashLen33to64(s)
	}
	// For strings over 64 bytes we loop.
	// Internal state consists of 56 bytes: v, w, x, y, and z.
	v := uint128{0, 0}
	w := uint128{0, 0}
	x := seed*k2 + fetch64(s, 0)
	y := seed*k1 + 113
	z := shiftMix(y*k2+113) * k2
	// Set end so that after the loop we have 1 to 64 bytes left to process.
	endIdx := ((slen - 1) / 64) * 64
	last64Idx := endIdx + ((slen - 1) & 63) - 63
	last64 := s[last64Idx:]
	for len(s) > 64 {
		x = rotate64(x+y+v.lo+fetch64(s, 8), 37) * k1
		y = rotate64(y+v.hi+fetch64(s, 48), 42) * k1
		x ^= w.hi
		y += v.lo + fetch64(s, 40)
		z = rotate64(z+w.lo, 33) * k1
		v.lo, v.hi = weakHashLen32WithSeeds(s, v.hi*k1, x+w.lo)
		w.lo, w.hi = weakHashLen32WithSeeds(s[32:], z+w.hi, y+fetch64(s, 16))
		x, z = z, x
		s = s[64:]
	}
	mul := k1 + ((z & 0xff) << 1)
	// Make s point to the last 64 bytes of input.
	s = last64
	w.lo += (uint64(slen-1) & 63)
	v.lo += w.lo
	w.lo += v.lo
	x = rotate64(x+y+v.lo+fetch64(s, 8), 37) * mul
	y = rotate64(y+v.hi+fetch64(s, 48), 42) * mul
	x ^= w.hi * 9
	y += v.lo*9 + fetch64(s, 40)
	z = rotate64(z+w.lo, 33) * mul
	v.lo, v.hi = weakHashLen32WithSeeds(s, v.hi*mul, x+w.lo)
	w.lo, w.hi = weakHashLen32WithSeeds(s[32:], z+w.hi, y+fetch64(s, 16))
	x, z = z, x
	return hashLen16Mul(hashLen16Mul(v.lo, w.lo, mul)+shiftMix(y)*k0+z, hashLen16Mul(v.hi, w.hi, mul)+x, mul)
}

func naHash64WithSeed(s []byte, seed uint64) uint64 {
	return naHash64WithSeeds(s, k2, seed)
}

func naHash64WithSeeds(s []byte, seed0, seed1 uint64) uint64 {
	return hashLen16(naHash64(s)-seed0, seed1)
}
@@ -0,0 +1,117 @@
package farm

func uoH(x, y, mul uint64, r uint) uint64 {
	a := (x ^ y) * mul
	a ^= (a >> 47)
	b := (y ^ a) * mul
	return rotate64(b, r) * mul
}

// Hash64WithSeeds hashes a byte slice and two uint64 seeds and returns a uint64 hash value
func Hash64WithSeeds(s []byte, seed0, seed1 uint64) uint64 {
	slen := len(s)
	if slen <= 64 {
		return naHash64WithSeeds(s, seed0, seed1)
	}

	// For strings over 64 bytes we loop.
	// Internal state consists of 64 bytes: u, v, w, x, y, and z.
	x := seed0
	y := seed1*k2 + 113
	z := shiftMix(y*k2) * k2
	v := uint128{seed0, seed1}
	var w uint128
	u := x - z
	x *= k2
	mul := k2 + (u & 0x82)

	// Set end so that after the loop we have 1 to 64 bytes left to process.
	endIdx := ((slen - 1) / 64) * 64
	last64Idx := endIdx + ((slen - 1) & 63) - 63
	last64 := s[last64Idx:]

	for len(s) > 64 {
		a0 := fetch64(s, 0)
		a1 := fetch64(s, 8)
		a2 := fetch64(s, 16)
		a3 := fetch64(s, 24)
		a4 := fetch64(s, 32)
		a5 := fetch64(s, 40)
		a6 := fetch64(s, 48)
		a7 := fetch64(s, 56)
		x += a0 + a1
		y += a2
		z += a3
		v.lo += a4
		v.hi += a5 + a1
		w.lo += a6
		w.hi += a7

		x = rotate64(x, 26)
		x *= 9
		y = rotate64(y, 29)
		z *= mul
		v.lo = rotate64(v.lo, 33)
		v.hi = rotate64(v.hi, 30)
		w.lo ^= x
		w.lo *= 9
		z = rotate64(z, 32)
		z += w.hi
		w.hi += z
		z *= 9
		u, y = y, u

		z += a0 + a6
		v.lo += a2
		v.hi += a3
		w.lo += a4
		w.hi += a5 + a6
		x += a1
		y += a7

		y += v.lo
		v.lo += x - y
		v.hi += w.lo
		w.lo += v.hi
		w.hi += x - y
		x += w.hi
		w.hi = rotate64(w.hi, 34)
		u, z = z, u
		s = s[64:]
	}
	// Make s point to the last 64 bytes of input.
	s = last64
	u *= 9
	v.hi = rotate64(v.hi, 28)
	v.lo = rotate64(v.lo, 20)
	w.lo += (uint64(slen-1) & 63)
	u += y
	y += u
	x = rotate64(y-x+v.lo+fetch64(s, 8), 37) * mul
	y = rotate64(y^v.hi^fetch64(s, 48), 42) * mul
	x ^= w.hi * 9
	y += v.lo + fetch64(s, 40)
	z = rotate64(z+w.lo, 33) * mul
	v.lo, v.hi = weakHashLen32WithSeeds(s, v.hi*mul, x+w.lo)
	w.lo, w.hi = weakHashLen32WithSeeds(s[32:], z+w.hi, y+fetch64(s, 16))
	return uoH(hashLen16Mul(v.lo+x, w.lo^y, mul)+z-u,
		uoH(v.hi+y, w.hi+z, k2, 30)^x,
		k2,
		31)
}

// Hash64WithSeed hashes a byte slice and a uint64 seed and returns a uint64 hash value
func Hash64WithSeed(s []byte, seed uint64) uint64 {
	if len(s) <= 64 {
		return naHash64WithSeed(s, seed)
	}
	return Hash64WithSeeds(s, 0, seed)
}

// Hash64 hashes a byte slice and returns a uint64 hash value
func Hash64(s []byte) uint64 {
	if len(s) <= 64 {
		return naHash64(s)
	}
	return Hash64WithSeeds(s, 81, 0)
}
@@ -0,0 +1,18 @@
package farm

// rotate32 rotates val right by shift bits.
func rotate32(val uint32, shift uint) uint32 {
	return ((val >> shift) | (val << (32 - shift)))
}

// rotate64 rotates val right by shift bits.
func rotate64(val uint64, shift uint) uint64 {
	return ((val >> shift) | (val << (64 - shift)))
}

// fetch32 reads a little-endian uint32 from s starting at idx.
func fetch32(s []byte, idx int) uint32 {
	return uint32(s[idx+0]) | uint32(s[idx+1])<<8 | uint32(s[idx+2])<<16 | uint32(s[idx+3])<<24
}

// fetch64 reads a little-endian uint64 from s starting at idx.
func fetch64(s []byte, idx int) uint64 {
	return uint64(s[idx+0]) | uint64(s[idx+1])<<8 | uint64(s[idx+2])<<16 | uint64(s[idx+3])<<24 |
		uint64(s[idx+4])<<32 | uint64(s[idx+5])<<40 | uint64(s[idx+6])<<48 | uint64(s[idx+7])<<56
}
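// Illustrative note (not part of the upstream file): fetch32 and fetch64
// read little-endian words, so, assuming encoding/binary and math/bits are
// available,
//
//	b := []byte{1, 2, 3, 4, 5, 6, 7, 8}
//	fetch64(b, 0) == binary.LittleEndian.Uint64(b) // true
//
// and rotate32(v, s) is equivalent to bits.RotateLeft32(v, -int(s)).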
@@ -2,12 +2,24 @@
"comment": "",
|
||||
"ignore": "test",
|
||||
"package": [
|
||||
{
|
||||
"checksumSHA1": "+Bo3QheGAtKFk7QPb+pdIEZNiYI=",
|
||||
"path": "github.com/AndreasBriese/bbloom",
|
||||
"revision": "28f7e881ca57bc00e028f9ede9f0d9104cfeef5e",
|
||||
"revisionTime": "2017-07-02T08:40:17Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "DBuGaMCW5qcB56KbHNMdfpF797Y=",
|
||||
"path": "github.com/apex/gateway",
|
||||
"revision": "e9c6ccec8851cbed57b52e96858f3dc0572921dc",
|
||||
"revisionTime": "2018-01-21T23:10:47Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "WBKIBKW4e4CON63PqRvGEvmg7Yc=",
|
||||
"path": "github.com/appleboy/com/convert",
|
||||
"revision": "c0b5901f9622d5256343198ed54af65af5d08cff",
|
||||
"revisionTime": "2018-04-10T03:06:38Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "1Jql7x7zDOmbDxGr4gsa8rFEC5g=",
|
||||
"path": "github.com/appleboy/go-fcm",
|
||||
|
@@ -100,12 +112,54 @@
"revision": "346938d642f2ec3594ed81d874461961cd0faa76",
|
||||
"revisionTime": "2016-10-29T20:57:26Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "7vwfpEh4rCpq6gS0D7qY+UhvAR4=",
|
||||
"path": "github.com/dgraph-io/badger",
|
||||
"revision": "deb140b6378159c58f803c01430a1b6ba3af248e",
|
||||
"revisionTime": "2018-04-06T05:54:07Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "oOuT7ebEiZ1ViHLKdFxKFOvobAQ=",
|
||||
"path": "github.com/dgraph-io/badger/options",
|
||||
"revision": "deb140b6378159c58f803c01430a1b6ba3af248e",
|
||||
"revisionTime": "2018-04-06T05:54:07Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "gGTDnTVVw5kcT2P5NXZV1YSckOU=",
|
||||
"path": "github.com/dgraph-io/badger/protos",
|
||||
"revision": "deb140b6378159c58f803c01430a1b6ba3af248e",
|
||||
"revisionTime": "2018-04-06T05:54:07Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "xBLLiAouTGA/lCAcQ5qjxQiuPCc=",
|
||||
"path": "github.com/dgraph-io/badger/skl",
|
||||
"revision": "deb140b6378159c58f803c01430a1b6ba3af248e",
|
||||
"revisionTime": "2018-04-06T05:54:07Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "I33KkP2lnYqJDasvvsAlebzkeko=",
|
||||
"path": "github.com/dgraph-io/badger/table",
|
||||
"revision": "deb140b6378159c58f803c01430a1b6ba3af248e",
|
||||
"revisionTime": "2018-04-06T05:54:07Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "gIZmO4zSlwCmPC99HpuBL8H6kjM=",
|
||||
"path": "github.com/dgraph-io/badger/y",
|
||||
"revision": "deb140b6378159c58f803c01430a1b6ba3af248e",
|
||||
"revisionTime": "2018-04-06T05:54:07Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "+TKtBzv23ywvmmqRiGEjUba4YmI=",
|
||||
"path": "github.com/dgrijalva/jwt-go",
|
||||
"revision": "dbeaa9332f19a944acb5736b4456cfcc02140e29",
|
||||
"revisionTime": "2017-10-19T21:57:19Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "+MUIij/Y6vqAkx0P5GjHeqnU9ZU=",
|
||||
"path": "github.com/dgryski/go-farm",
|
||||
"revision": "2de33835d10275975374b37b2dcfd22c9020a1f5",
|
||||
"revisionTime": "2018-01-09T07:02:41Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "rRYbBQkVsd/+EIryahsywNi34Lo=",
|
||||
"path": "github.com/emirpasic/gods/containers",
|
||||
|
|