Skip to content

Commit

Permalink
Experimental stats output [#70] (#75)
Browse files Browse the repository at this point in the history
* Output a .tsv.gz of all non-duplicate ZXY tiles with their compressed length.
  • Loading branch information
bdon authored Sep 12, 2023
1 parent e53a394 commit dac421a
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ dist/
go-pmtiles
*.pmtiles
*.geojson
*.tsv.gz
19 changes: 17 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ var cli struct {
Overfetch float32 `default:0.05 help:"What ratio of extra data to download to minimize # requests; 0.2 is 20%"`
} `cmd:"" help:"Create an archive from a larger archive for a subset of zoom levels or geographic region."`

Makesync struct {
Input string `arg:"" type:"existingfile"`
BlockSize int `default:1000 help:"The block size, in # of tiles."`
HashFunction string `default:fnv1a help:"The hash function."`
} `cmd:"" help:"Generates an **experimental** sync control file (.pmtiles.sync) for a local archive."`

Stats struct {
Input string `arg:"" type:"existingfile"`
} `cmd:"" help:"Add a vector tile statistics file (.tilestats.tsv.gz) used for further analysis with DuckDB."`

Verify struct {
Input string `arg:"" help:"Input archive." type:"existingfile"`
} `cmd:"" help:"Verifies that a local archive is valid."`
Expand Down Expand Up @@ -91,12 +101,12 @@ func main() {
case "show <path>":
err := pmtiles.Show(logger, cli.Show.Bucket, cli.Show.Path, false, 0, 0, 0)
if err != nil {
logger.Fatalf("Failed to show database, %v", err)
logger.Fatalf("Failed to show archive, %v", err)
}
case "tile <path> <z> <x> <y>":
err := pmtiles.Show(logger, cli.Tile.Bucket, cli.Tile.Path, true, cli.Tile.Z, cli.Tile.X, cli.Tile.Y)
if err != nil {
logger.Fatalf("Failed to show database, %v", err)
logger.Fatalf("Failed to show tile, %v", err)
}
case "serve <path>":
server, err := pmtiles.NewServer(cli.Serve.Bucket, cli.Serve.Path, logger, cli.Serve.CacheSize, cli.Serve.Cors, cli.Serve.PublicHostname)
Expand Down Expand Up @@ -125,6 +135,11 @@ func main() {
if err != nil {
logger.Fatalf("Failed to extract, %v", err)
}
case "stats <input>":
err := pmtiles.Stats(logger, cli.Stats.Input)
if err != nil {
logger.Fatalf("Failed to stats archive, %v", err)
}
case "convert <input> <output>":
path := cli.Convert.Input
output := cli.Convert.Output
Expand Down
118 changes: 118 additions & 0 deletions pmtiles/stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package pmtiles

import (
"bytes"
"compress/gzip"
"context"
"encoding/csv"
"fmt"
"github.com/RoaringBitmap/roaring/roaring64"
"io"
"log"
"os"
"strconv"
"time"
)

// Stats writes a gzipped TSV (file + ".stats.tsv.gz") listing the Z, X, Y
// coordinates and compressed byte length of every non-duplicated tile in a
// local MVT archive. It makes two passes over the archive's directory tree:
// the first finds tile-data offsets referenced by exactly one entry, the
// second emits one TSV row per such tile. Only MVT (vector) archives are
// supported.
func Stats(logger *log.Logger, file string) error {
	start := time.Now()
	ctx := context.Background()

	bucketURL, key, err := NormalizeBucketKey("", "", file)

	if err != nil {
		return err
	}

	bucket, err := OpenBucket(ctx, bucketURL, "")

	if err != nil {
		return fmt.Errorf("Failed to open bucket for %s, %w", bucketURL, err)
	}
	defer bucket.Close()

	// The first 16 KiB is guaranteed to contain the fixed-size header.
	r, err := bucket.NewRangeReader(ctx, key, 0, 16384)

	if err != nil {
		return fmt.Errorf("Failed to create range reader for %s, %w", key, err)
	}
	b, err := io.ReadAll(r)
	if err != nil {
		return fmt.Errorf("Failed to read %s, %w", key, err)
	}
	r.Close()

	header, err := deserialize_header(b[0:HEADERV3_LEN_BYTES])
	// Fix: the original ignored this error and proceeded with a zero-value
	// header on corrupt input.
	if err != nil {
		return fmt.Errorf("Failed to deserialize header for %s, %w", key, err)
	}

	if header.TileType != Mvt {
		return fmt.Errorf("Stats only works on MVT vector tilesets.")
	}

	// CollectEntries walks the directory tree rooted at (dir_offset,
	// dir_length), calling f for every tile entry. Declared separately from
	// its assignment so the closure can recurse into leaf directories.
	var CollectEntries func(uint64, uint64, func(EntryV3))

	CollectEntries = func(dir_offset uint64, dir_length uint64, f func(EntryV3)) {
		dirbytes, err := bucket.NewRangeReader(ctx, key, int64(dir_offset), int64(dir_length))
		if err != nil {
			// Wrap the underlying error; the original panicked with a bare
			// "I/O error" and lost the cause.
			panic(fmt.Errorf("I/O error reading directory: %w", err))
		}
		defer dirbytes.Close()
		// Use a local buffer instead of reassigning the enclosing b/err.
		dirb, err := io.ReadAll(dirbytes)
		if err != nil {
			panic(fmt.Errorf("I/O error reading directory: %w", err))
		}

		directory := deserialize_entries(bytes.NewBuffer(dirb))
		for _, entry := range directory {
			if entry.RunLength > 0 {
				f(entry)
			} else {
				// RunLength == 0 marks a pointer to a leaf directory.
				CollectEntries(header.LeafDirectoryOffset+entry.Offset, uint64(entry.Length), f)
			}
		}
	}

	// Pass 1: through the entire entry set, finding all non-duplicated tiles.
	// An offset seen more than once means multiple tile IDs share that data.
	seen_once := roaring64.New()
	seen_twice := roaring64.New()
	CollectEntries(header.RootOffset, header.RootLength, func(e EntryV3) {
		if seen_once.Contains(e.Offset) {
			seen_twice.Add(e.Offset)
		}
		seen_once.Add(e.Offset)
	})

	seen_once.AndNot(seen_twice)
	fmt.Println("Non-duplicate tiles:", seen_once.GetCardinality())

	// Pass 2: walk the directories again in order, writing one row per
	// non-duplicated tile.

	output, err := os.Create(file + ".stats.tsv.gz")
	if err != nil {
		return fmt.Errorf("Failed to create output, %w", err)
	}
	defer output.Close()

	gzWriter := gzip.NewWriter(output)
	defer gzWriter.Close()

	// Defers run LIFO: the csv flush happens before the gzip stream and the
	// file are closed.
	csvWriter := csv.NewWriter(gzWriter)
	csvWriter.Comma = '\t'
	defer csvWriter.Flush()
	if err := csvWriter.Write([]string{"z", "x", "y", "bytes_compressed"}); err != nil {
		return fmt.Errorf("Failed to write header to TSV: %w", err)
	}

	CollectEntries(header.RootOffset, header.RootLength, func(e EntryV3) {
		if seen_once.Contains(e.Offset) {
			z, x, y := IdToZxy(e.TileId)
			row := []string{
				strconv.FormatUint(uint64(z), 10),
				strconv.FormatUint(uint64(x), 10),
				strconv.FormatUint(uint64(y), 10),
				strconv.FormatUint(uint64(e.Length), 10),
			}
			if err := csvWriter.Write(row); err != nil {
				panic(fmt.Errorf("Failed to write record to TSV: %w", err))
			}
		}
	})

	fmt.Printf("Completed stats in %v.\n", time.Since(start))
	return nil
}

0 comments on commit dac421a

Please sign in to comment.