LocalStore #1032
Changes from 71 commits
@@ -0,0 +1,56 @@
// Copyright 2019 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

/*
Package localstore provides a disk storage layer for Swarm Chunk persistence.
It uses swarm/shed abstractions on top of the github.com/syndtr/goleveldb
LevelDB implementation.

The main type is DB, which manages the storage by providing methods to
access and add Chunks and to manage their status.

Modes are abstractions that perform specific changes to Chunks. There are
three mode types:

 - ModeGet, for Chunk access
 - ModePut, for adding Chunks to the database
 - ModeSet, for changing Chunk statuses

Every mode type has a corresponding type (Getter, Putter and Setter)
that provides an adequate method to perform the operation, and that type
should be injected into localstore consumers instead of the whole DB.
This gives clearer insight into which operations a consumer performs
on the database.

Getters, Putters and Setters accept different get, put and set modes
to perform different actions. For example, ModeGet has two different
variables, ModeGetRequest and ModeGetSync, and two different Getters
can be constructed with them, to be used when a chunk is requested or
when a chunk is synced, as these two events change the database
differently.

Subscription methods are implemented for the specific purpose of
continuous iteration over Chunks that should be provided to
Push and Pull syncing.

DB implements an internal garbage collector that removes only synced
Chunks from the database, based on their most recent access time.

Internally, DB stores Chunk data and any required information, such as
store and access timestamps, in different shed indexes that can be
iterated on by the garbage collector or subscriptions.
*/
package localstore
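The mode-injection design described above can be illustrated with a small self-contained sketch. The Mode values and the Getter/NewGetter names mirror the package documentation, but the bodies are simplified stand-ins, not the actual localstore implementation:

```go
package main

import "fmt"

// Mode is a simplified stand-in for localstore's ModeGet variants.
type Mode int

const (
	ModeGetRequest Mode = iota // chunk is requested by a client
	ModeGetSync                // chunk is accessed by syncing
)

// DB is a toy database; the real localstore.DB wraps shed indexes.
type DB struct{ accessLog []Mode }

// Getter exposes only read access to the DB, bound to one mode.
// Consumers receive a Getter instead of the whole DB, which makes
// it explicit that they only perform get operations.
type Getter struct {
	db   *DB
	mode Mode
}

// NewGetter constructs a Getter bound to a specific get mode.
func (db *DB) NewGetter(mode Mode) *Getter {
	return &Getter{db: db, mode: mode}
}

// Get records the access together with the Getter's mode. The real
// implementation updates access timestamps and the gc index
// differently depending on the mode.
func (g *Getter) Get(addr string) string {
	g.db.accessLog = append(g.db.accessLog, g.mode)
	return "chunk data for " + addr
}

func main() {
	db := &DB{}
	requester := db.NewGetter(ModeGetRequest)
	syncer := db.NewGetter(ModeGetSync)
	requester.Get("aa01")
	syncer.Get("aa02")
	fmt.Println(len(db.accessLog)) // two accesses recorded
}
```

Injecting the narrow Getter/Putter/Setter types rather than DB keeps each consumer's capability surface explicit.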
@@ -0,0 +1,229 @@
// Copyright 2018 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package localstore

import (
	"time"

	"github.com/ethereum/go-ethereum/log"
	"github.com/ethereum/go-ethereum/swarm/shed"
	"github.com/syndtr/goleveldb/leveldb"
)
var (
	// gcTargetRatio defines the target number of items
	// in garbage collection index that will not be removed
	// on garbage collection. The target number of items
	// is calculated by gcTarget function. This value must be
	// in range (0,1]. For example, with 0.9 value,
	// garbage collection will leave 90% of defined capacity
	// in database after its run. This prevents frequent
	// garbage collection runs.
	gcTargetRatio = 0.9
	// gcBatchSize limits the number of chunks in a single
	// leveldb batch on garbage collection.
	gcBatchSize int64 = 1000
)
||
// collectGarbageWorker is a long running function that waits for | ||
// collectGarbageTrigger channel to signal a garbage collection | ||
// run. GC run iterates on gcIndex and removes older items | ||
// form retrieval and other indexes. | ||
func (db *DB) collectGarbageWorker() { | ||
for { | ||
select { | ||
case <-db.collectGarbageTrigger: | ||
// run a single collect garbage run and | ||
// if done is false, gcBatchSize is reached and | ||
// another collect garbage run is needed | ||
collectedCount, done, err := db.collectGarbage() | ||
if err != nil { | ||
log.Error("localstore collect garbage", "err", err) | ||
} | ||
// check if another gc run is needed | ||
if !done { | ||
db.triggerGarbageCollection() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this worker needed and trigger needed? see my comment above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
} | ||
|
||
if testHookCollectGarbage != nil { | ||
testHookCollectGarbage(collectedCount) | ||
} | ||
case <-db.close: | ||
return | ||
} | ||
} | ||
} | ||
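The worker's re-trigger loop (run a batch-limited GC pass; if it reports done=false, trigger another pass) can be sketched with a toy model. collectBatch here is a hypothetical stand-in for db.collectGarbage, not the real method:

```go
package main

import "fmt"

// collectBatch is a hypothetical stand-in for db.collectGarbage: it
// removes up to batchSize items and reports done=true once the item
// count has dropped to the target.
func collectBatch(items *int, target, batchSize int) (collected int, done bool) {
	for *items > target && collected < batchSize {
		*items--
		collected++
	}
	return collected, *items <= target
}

func main() {
	items, target, batchSize := 2500, 900, 1000
	runs := 0
	// Mirror the worker loop above: when a run returns done=false,
	// trigger another run until the target is reached.
	for done := false; !done; {
		_, done = collectBatch(&items, target, batchSize)
		runs++
	}
	fmt.Println(items, runs) // items trimmed to the target in 2 runs
}
```

Splitting one large GC pass into bounded batches keeps each leveldb write small at the cost of re-triggering the worker.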
// collectGarbage removes chunks from retrieval and other
// indexes if maximal number of chunks in database is reached.
// This function returns the number of removed chunks. If done
// is false, another call to this function is needed to collect
// the rest of the garbage as the batch size limit is reached.
// This function is called in collectGarbageWorker.
func (db *DB) collectGarbage() (collectedCount int64, done bool, err error) {
	batch := new(leveldb.Batch)
	target := db.gcTarget()

	done = true
	err = db.gcIndex.Iterate(func(item shed.Item) (stop bool, err error) {
		// protect parallel updates
		unlock, err := db.lockAddr(item.Address)
		if err != nil {
			return false, err
		}
		defer unlock()

		gcSize := db.getGCSize()
		if gcSize-collectedCount <= target {
			return true, nil
		}

Review comment: Is there not the danger of the following here: the "loop" repeats. What happens if we reach the target while new chunks wait for write, is the write interrupted so that GC runs? What if the DB is at 90%, then new chunks arrive exceeding capacity (say 105%), will the write be interrupted?

Reply: I assume that you are referring only to a global lock. You can see that every chunk is saved in its own batch, so this can not happen, as gc should kick in. There would be one difference with a global lock: the gc size counting would be much simpler and included in the chunk write batch. But this applies only to the global lock, which is here only for BenchmarkPutUpload, until we decide whether to use it or stick with the address lock.
		// delete from retrieve, pull, gc
		db.retrievalDataIndex.DeleteInBatch(batch, item)
		db.retrievalAccessIndex.DeleteInBatch(batch, item)
		db.pullIndex.DeleteInBatch(batch, item)
		db.gcIndex.DeleteInBatch(batch, item)

Review comment: Can ANY of these
Reply: No. You can check how LevelDB handles batches. All errors are handled in its Write(batch) method.

		collectedCount++
		if collectedCount >= gcBatchSize {
			// batch size limit reached,
			// another gc run is needed
			done = false
			return true, nil
		}
		return false, nil
	}, nil)
	if err != nil {
		return 0, false, err
	}
	err = db.shed.WriteBatch(batch)
	if err != nil {
		return 0, false, err
	}
	// batch is written, decrement gcSize
	db.incGCSize(-collectedCount)
	return collectedCount, done, nil
}

Review comment: I don't understand this. If
Reply: Would you suggest where the done parameter should be explained in more detail? There is a comment above in the code, but maybe it is badly written. The done parameter indicates whether the garbage collection run reached the target; it is false when the max gc batch size is reached. Do you have an idea how it could be made clearer? This should not cause problems, as the gc size change is protected. If the global lock is released, we can allow more chunks to be saved, but they will not be collected, as they will be at the top of the gc index, and gc iterates from the bottom of it.
// gcTarget returns the absolute value for garbage collection
// target value, calculated from db.capacity and gcTargetRatio.
func (db *DB) gcTarget() (target int64) {
	return int64(float64(db.capacity) * gcTargetRatio)
}
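A minimal sketch of the target calculation above, assuming the default gcTargetRatio of 0.9 (gcTarget here is a free function for illustration; in localstore it is a DB method reading db.capacity):

```go
package main

import "fmt"

// gcTarget mirrors the formula above: the number of items a garbage
// collection run leaves in the database, calculated from the
// capacity and the target ratio.
func gcTarget(capacity int64, ratio float64) int64 {
	return int64(float64(capacity) * ratio)
}

func main() {
	// With the default gcTargetRatio of 0.9 and a capacity of one
	// million chunks, GC stops once 900000 items remain.
	fmt.Println(gcTarget(1000000, 0.9))
}
```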
// incGCSize increments gcSize by the provided number.
// If count is negative, it will decrement gcSize.
func (db *DB) incGCSize(count int64) {
	if count == 0 {
		return
	}

	db.gcSizeMu.Lock()
	new := db.gcSize + count
	db.gcSize = new
	db.gcSizeMu.Unlock()

	select {
	case db.writeGCSizeTrigger <- struct{}{}:
	default:
	}
	if new >= db.capacity {
		db.triggerGarbageCollection()
	}
}
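The non-blocking send used above (a select with a default case on a buffered channel) can be shown in isolation. The trigger helper below is illustrative, not part of localstore:

```go
package main

import "fmt"

// trigger signals without blocking: if a signal is already pending
// on the buffered channel, the new one is dropped (coalesced). This
// is the select/default pattern used for writeGCSizeTrigger and
// collectGarbageTrigger.
func trigger(ch chan struct{}) {
	select {
	case ch <- struct{}{}:
	default:
	}
}

func main() {
	ch := make(chan struct{}, 1)
	// Five rapid triggers leave exactly one pending signal, so the
	// receiving worker wakes up once instead of five times.
	for i := 0; i < 5; i++ {
		trigger(ch)
	}
	fmt.Println(len(ch))
}
```

Coalescing triggers this way means incGCSize never blocks on a busy worker, while the worker still observes at least one signal.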
// getGCSize returns gcSize value by locking it
// with gcSizeMu mutex.
func (db *DB) getGCSize() (count int64) {
	db.gcSizeMu.RLock()
	count = db.gcSize
	db.gcSizeMu.RUnlock()
	return count
}
// triggerGarbageCollection signals collectGarbageWorker
// to call collectGarbage.
func (db *DB) triggerGarbageCollection() {
	select {
	case db.collectGarbageTrigger <- struct{}{}:
	case <-db.close:
	default:
	}
}
// writeGCSizeWorker writes gcSize on trigger event
// and waits writeGCSizeDelay after each write.
// It implements a linear backoff with delay of
// writeGCSizeDelay duration to avoid very frequent
// database operations.
func (db *DB) writeGCSizeWorker() {
	for {
		select {
		case <-db.writeGCSizeTrigger:
			err := db.writeGCSize(db.getGCSize())
			if err != nil {
				log.Error("localstore write gc size", "err", err)
			}
			// Wait some time before writing gc size in the next
			// iteration. This prevents frequent I/O operations.
			select {
			case <-time.After(10 * time.Second):
			case <-db.close:
				return
			}
		case <-db.close:
			return
		}
	}
}
// writeGCSize stores the number of items in gcIndex.
// It removes all hashes from gcUncountedHashesIndex
// so that they are not included when gcSize is counted
// on the next database initialization (node restart).
func (db *DB) writeGCSize(gcSize int64) (err error) {

Review comment: What is ... More information please
Reply: Will try to explain it better in the comment in the constructor. Do you think that there should be a gcCountedHashesIndex?

Review comment: ?
Reply: Yes, maybe initialization is not the best word here. I will update the comment. It is meant for when the node starts again (reboots).

Review comment: I believe there should be some higher level description of what the process of GC looks like. I know you may have explained this during your walkthrough, but I already can't remember many details; also, I participated in the first walkthrough only, and there we didn't go into the details of GC. Also, if a new member comes on board it would be useful.
Reply: I will try to write documentation around gcSize persistence. I've added BenchmarkPutUpload to measure whether we should use a global lock and reduce the complexity around gcSize counting.
	const maxBatchSize = 1000

	batch := new(leveldb.Batch)
	db.storedGCSize.PutInBatch(batch, uint64(gcSize))
	batchSize := 1

	// use only one iterator, as it acquires its snapshot,
	// so that hashes added to the index after the stored
	// gc size is written are not removed
	err = db.gcUncountedHashesIndex.Iterate(func(item shed.Item) (stop bool, err error) {
		db.gcUncountedHashesIndex.DeleteInBatch(batch, item)
		batchSize++
		if batchSize >= maxBatchSize {
			err = db.shed.WriteBatch(batch)
			if err != nil {
				return false, err
			}
			batch.Reset()
			batchSize = 0
		}
		return false, nil
	}, nil)
	if err != nil {
		return err
	}
	return db.shed.WriteBatch(batch)
}
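The flush-every-maxBatchSize pattern used in this function can be modeled with an in-memory stand-in for leveldb.Batch (the batch type and deleteAll function below are illustrative only):

```go
package main

import "fmt"

// batch is an in-memory stand-in for leveldb.Batch: it only counts
// buffered operations and completed writes.
type batch struct {
	ops    int
	writes int
}

func (b *batch) delete() { b.ops++ }
func (b *batch) write() error {
	b.writes++
	b.ops = 0
	return nil
}

// deleteAll mirrors the iteration above: deletes are buffered into a
// batch that is written out whenever it reaches maxBatchSize
// operations, with one final write for any remainder.
func deleteAll(items, maxBatchSize int) (int, error) {
	b := new(batch)
	for i := 0; i < items; i++ {
		b.delete()
		if b.ops >= maxBatchSize {
			if err := b.write(); err != nil {
				return b.writes, err
			}
		}
	}
	if err := b.write(); err != nil {
		return b.writes, err
	}
	return b.writes, nil
}

func main() {
	writes, _ := deleteAll(2500, 1000)
	fmt.Println(writes) // two full batches plus the final partial one
}
```

Bounding the batch size keeps each leveldb write small even when many uncounted hashes have accumulated.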
// testHookCollectGarbage is a hook that can provide
// information when a garbage collection run is done
// and how many items it removed.
var testHookCollectGarbage func(collectedCount int64)