diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..4c0f8e8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,20 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior. Include code samples here when possible. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..713fcc0 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,9 @@ +**Description:** + +**Link to tracking Issue:** + +**Testing:** < Describe what testing was performed and which tests were added.> + +**Documentation:** < Describe the documentation added.> \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..48e9074 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,32 @@ +name: build + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + + build: + strategy: + matrix: + go-version: [1.21.3] + name: Build + runs-on: ubuntu-latest + steps: + + - name: Set up Go 1.x + uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go-version }} + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Build + run: make ci + + - name: Code coverage + run: bash <(curl -s https://codecov.io/bash) diff --git a/.github/workflows/build.yml~ b/.github/workflows/build.yml~ new file mode 100644 index 0000000..f3d87dc --- /dev/null +++ b/.github/workflows/build.yml~ @@ -0,0 +1,38 @@ +name: build + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + + build: + strategy: + matrix: + go-version: [1.21.3] + name: Build + runs-on: ubuntu-latest + steps: + + - name: Set up Go 1.x + uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go-version }} + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Build + run: make ci + + - name: Code coverage + run: bash <(curl -s https://codecov.io/bash) + + - name: Send telemetry to Lightstep + uses: codeboten/github-action-to-otlp@v1 + with: + endpoint: "ingest.lightstep.com:443" + headers: "lightstep-access-token=${{ secrets.ACCESS_TOKEN }}" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7a3bfdc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +go.work +go.work.sum + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index c9770ee..131b6b3 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,58 @@ +[![Docs](https://godoc.org/github.com/lightstep/varopt?status.svg)](https://godoc.org/github.com/lightstep/varopt) + +# VarOpt Sampling Algorithm + This is an implementation of VarOpt, an unbiased weighted sampling algorithm described in the paper [Stream sampling for variance-optimal -estimation of subset sums](https://arxiv.org/pdf/0803.0473.pdf). +estimation of subset sums](https://arxiv.org/pdf/0803.0473.pdf) (2008) +by Edith Cohen, Nick Duffield, Haim Kaplan, Carsten Lund, and Mikkel +Thorup. + +VarOpt is a reservoir-type sampler that maintains a fixed-size sample +and provides a mechanism for merging unequal-weight samples. + +This repository also includes a simple reservoir sampling algorithm, +often useful in conjunction with weighed reservoir sampling, that +implements Algorithm R from [Random sampling with a +reservoir](https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R) +(1985) by Jeffrey Vitter. + +## Usage: Natural Weights + +A typical use of VarOpt sampling is to estimate network flows using +sample packets. In this use-case, the weight applied to each sample +is the size of the packet. Because VarOpt computes an unbiased +sample, sample data points can be summarized along secondary +dimensions. For example, we can select a subset of sampled packets +according to a secondary attribute, sum the sample weights, and the +result is expected to equal the size of packets corresponding to the +secondary attribute from the original population. + +See [weighted_test.go](https://github.com/lightstep/varopt/blob/master/weighted_test.go) for an example. + +## Usage: Inverse-probability Weights + +Another use for VarOpt sampling uses inverse-probability weights to +estimate frequencies while simultaneously controlling sample +diversity. Suppose a sequence of observations can be naturally +categorized into N different buckets. The goal in this case is to +compute a sample where each bucket is well represented, while +maintaining frequency estimates. + +In this use-case, the weight assigned to each observation is the +inverse probability of the bucket it belongs to. The result of +weighted sampling with inverse-probability weights is a uniform +expectated value; in this example we expect an equal number of +observations falling into each bucket. Each observation represents a +frequency of its sample weight (computed by VarOpt) divided by its +original weight (the inverse-probability). + +See [frequency_test.go](https://github.com/lightstep/varopt/blob/master/frequency_test.go) for an example. + +## Usage: Merging Samples + +VarOpt supports merging independently collected samples one +observation at a time. This is useful for building distributed +sampling schemes. In this use-case, each node in a distributed system +computes a weighted sample. To combine samples, simply input all the +observations and their corresponding weights into a new VarOpt sample. \ No newline at end of file diff --git a/benchmark_test.go b/benchmark_test.go new file mode 100644 index 0000000..5ca86fe --- /dev/null +++ b/benchmark_test.go @@ -0,0 +1,74 @@ +// Copyright 2019, LightStep Inc. +// +// The benchmark results point to a performance drop when the +// largeHeap starts to be used because of interface conversions in and +// out of the heap, primarily due to the heap interface. This +// suggests room for improvement by avoiding the built-in heap. + +/* +BenchmarkAdd_Norm_100-8 37540165 32.1 ns/op 8 B/op 0 allocs/op +BenchmarkAdd_Norm_10000-8 39850280 30.6 ns/op 8 B/op 0 allocs/op +BenchmarkAdd_Norm_1000000-8 7958835 183 ns/op 52 B/op 0 allocs/op +BenchmarkAdd_Exp_100-8 41565934 28.5 ns/op 8 B/op 0 allocs/op +BenchmarkAdd_Exp_10000-8 43622184 29.2 ns/op 8 B/op 0 allocs/op +BenchmarkAdd_Exp_1000000-8 8103663 220 ns/op 55 B/op 0 allocs/op +*/ + +package varopt_test + +import ( + "math" + "math/rand" + "testing" + + "github.com/lightstep/varopt" +) + +func normValue(rnd *rand.Rand) float64 { + return 1 + math.Abs(rnd.NormFloat64()) +} + +func expValue(rnd *rand.Rand) float64 { + return rnd.ExpFloat64() +} + +func BenchmarkAdd_Norm_100(b *testing.B) { + benchmarkAdd(b, 100, normValue) +} + +func BenchmarkAdd_Norm_10000(b *testing.B) { + benchmarkAdd(b, 10000, normValue) +} + +func BenchmarkAdd_Norm_1000000(b *testing.B) { + benchmarkAdd(b, 1000000, normValue) +} + +func BenchmarkAdd_Exp_100(b *testing.B) { + benchmarkAdd(b, 100, expValue) +} + +func BenchmarkAdd_Exp_10000(b *testing.B) { + benchmarkAdd(b, 10000, expValue) +} + +func BenchmarkAdd_Exp_1000000(b *testing.B) { + benchmarkAdd(b, 1000000, expValue) +} + +type thing struct{} + +func benchmarkAdd(b *testing.B, size int, f func(rnd *rand.Rand) float64) { + b.ReportAllocs() + rnd := rand.New(rand.NewSource(3331)) + v := varopt.New[thing](size, rnd) + weights := make([]float64, b.N) + for i := 0; i < b.N; i++ { + weights[i] = f(rnd) + } + + b.StartTimer() + for i := 0; i < b.N; i++ { + v.Add(thing{}, weights[i]) + } +} diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..fe0e2e6 --- /dev/null +++ b/doc.go @@ -0,0 +1,22 @@ +// Copyright 2019, LightStep Inc. + +/* +Package varopt contains an implementation of VarOpt, an unbiased weighted +sampling algorithm described in the paper "Stream sampling for +variance-optimal estimation of subset sums" +https://arxiv.org/pdf/0803.0473.pdf (2008), by Edith Cohen, Nick +Duffield, Haim Kaplan, Carsten Lund, and Mikkel Thorup. + +VarOpt is a reservoir-type sampler that maintains a fixed-size sample +and provides a mechanism for merging unequal-weight samples. + +This package also includes a simple reservoir sampling algorithm, +often useful in conjunction with weighed reservoir sampling, using +Algorithm R from "Random sampling with a +reservoir", https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R +(1985), by Jeffrey Vitter. + +See https://github.com/lightstep/varopt/blob/master/README.md for +more detail. +*/ +package varopt diff --git a/frequency_test.go b/frequency_test.go new file mode 100644 index 0000000..a8b0f3a --- /dev/null +++ b/frequency_test.go @@ -0,0 +1,142 @@ +// Copyright 2019, LightStep Inc. + +package varopt_test + +import ( + "fmt" + "math" + "math/rand" + + "github.com/lightstep/varopt" +) + +type curve struct { + color string + mean float64 + stddev float64 +} + +type testPoint struct { + color int + xvalue float64 +} + +var colors = []curve{ + {color: "red", mean: 10, stddev: 15}, + {color: "green", mean: 30, stddev: 10}, + {color: "blue", mean: 50, stddev: 20}, +} + +// This example shows how to use Varopt sampling to estimate +// frequencies with the use of inverse probability weights. The use +// of inverse probability creates a uniform expected value, in this of +// the number of sample points per second. +// +// While the number of expected points per second is uniform, the +// output sample weights are expected to match the original +// frequencies. +func ExampleVaropt_GetOriginalWeight() { + // Number of points. + const totalCount = 1e6 + + // Relative size of the sample. + const sampleRatio = 0.01 + + // Ensure this test is deterministic. + rnd := rand.New(rand.NewSource(104729)) + + // Construct a timeseries consisting of three colored signals, + // for x=0 to x=60 seconds. + var points []testPoint + + // origCounts stores the original signals at second granularity. + origCounts := make([][]int, len(colors)) + for i := range colors { + origCounts[i] = make([]int, 60) + } + + // Construct the signals by choosing a random color, then + // using its Gaussian to compute a timestamp. + for len(points) < totalCount { + choose := rnd.Intn(len(colors)) + series := colors[choose] + xvalue := rnd.NormFloat64()*series.stddev + series.mean + + if xvalue < 0 || xvalue > 60 { + continue + } + origCounts[choose][int(math.Floor(xvalue))]++ + points = append(points, testPoint{ + color: choose, + xvalue: xvalue, + }) + } + + // Compute the total number of points per second. This will be + // used to establish the per-second probability. + xcount := make([]int, 60) + for _, point := range points { + xcount[int(math.Floor(point.xvalue))]++ + } + + // Compute the sample with using the inverse probability as a + // weight. This ensures a uniform distribution of points in each + // second. + sampleSize := int(sampleRatio * float64(totalCount)) + sampler := varopt.New[testPoint](sampleSize, rnd) + for _, point := range points { + second := int(math.Floor(point.xvalue)) + prob := float64(xcount[second]) / float64(totalCount) + sampler.Add(point, 1/prob) + } + + // sampleCounts stores the reconstructed signals. + sampleCounts := make([][]float64, len(colors)) + for i := range colors { + sampleCounts[i] = make([]float64, 60) + } + + // pointCounts stores the number of points per second. + pointCounts := make([]int, 60) + + // Reconstruct the signals using the output sample weights. + // The effective count of each sample point is its output + // weight divided by its original weight. + for i := 0; i < sampler.Size(); i++ { + point, weight := sampler.Get(i) + origWeight := sampler.GetOriginalWeight(i) + second := int(math.Floor(point.xvalue)) + sampleCounts[point.color][second] += (weight / origWeight) + pointCounts[second]++ + } + + // Compute standard deviation of sample points per second. + sum := 0.0 + mean := float64(sampleSize) / 60 + for s := 0; s < 60; s++ { + e := float64(pointCounts[s]) - mean + sum += e * e + } + stddev := math.Sqrt(sum / (60 - 1)) + + fmt.Printf("Samples per second mean %.2f\n", mean) + fmt.Printf("Samples per second standard deviation %.2f\n", stddev) + + // Compute mean absolute percentage error between sampleCounts + // and origCounts for each signal. + for c := range colors { + mae := 0.0 + for s := 0; s < 60; s++ { + mae += math.Abs(sampleCounts[c][s]-float64(origCounts[c][s])) / float64(origCounts[c][s]) + } + mae /= 60 + fmt.Printf("Mean absolute percentage error (%s) = %.2f%%\n", colors[c].color, mae*100) + } + + // Output: + // Samples per second mean 166.67 + // Samples per second standard deviation 13.75 + // Mean absolute percentage error (red) = 25.16% + // Mean absolute percentage error (green) = 14.30% + // Mean absolute percentage error (blue) = 14.23% +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c60faa9 --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module github.com/lightstep/varopt + +go 1.21 + +require github.com/stretchr/testify v1.8.4 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/kr/pretty v0.3.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..fa915d6 --- /dev/null +++ b/go.sum @@ -0,0 +1,23 @@ +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/sampleheap.go b/internal/sampleheap.go new file mode 100644 index 0000000..3b9a201 --- /dev/null +++ b/internal/sampleheap.go @@ -0,0 +1,57 @@ +// Copyright 2019, LightStep Inc. + +package internal + +type Vsample[T any] struct { + Sample T + Weight float64 +} + +type SampleHeap[T any] []Vsample[T] + +func (sh *SampleHeap[T]) Push(v Vsample[T]) { + l := append(*sh, v) + n := len(l) - 1 + + // This copies the body of heap.up(). + j := n + for { + i := (j - 1) / 2 // parent + if i == j || l[j].Weight >= l[i].Weight { + break + } + l[i], l[j] = l[j], l[i] + j = i + } + + *sh = l +} + +func (sh *SampleHeap[T]) Pop() Vsample[T] { + l := *sh + n := len(l) - 1 + result := l[0] + l[0] = l[n] + l = l[:n] + + // This copies the body of heap.down(). + i := 0 + for { + j1 := 2*i + 1 + if j1 >= n || j1 < 0 { // j1 < 0 after int overflow + break + } + j := j1 // left child + if j2 := j1 + 1; j2 < n && l[j2].Weight < l[j1].Weight { + j = j2 // = 2*i + 2 // right child + } + if l[j].Weight >= l[i].Weight { + break + } + l[i], l[j] = l[j], l[i] + i = j + } + + *sh = l + return result +} diff --git a/internal/sampleheap_test.go b/internal/sampleheap_test.go new file mode 100644 index 0000000..d9647a1 --- /dev/null +++ b/internal/sampleheap_test.go @@ -0,0 +1,61 @@ +// Copyright 2019, LightStep Inc. + +package internal_test + +import ( + "container/heap" + "math/rand" + "testing" + + "github.com/lightstep/varopt/internal" + "github.com/stretchr/testify/require" +) + +type simpleHeap []float64 + +func (s *simpleHeap) Len() int { + return len(*s) +} + +func (s *simpleHeap) Swap(i, j int) { + (*s)[i], (*s)[j] = (*s)[j], (*s)[i] +} + +func (s *simpleHeap) Less(i, j int) bool { + return (*s)[i] < (*s)[j] +} + +func (s *simpleHeap) Push(x interface{}) { + *s = append(*s, x.(float64)) +} + +func (s *simpleHeap) Pop() interface{} { + old := *s + n := len(old) + x := old[n-1] + *s = old[0 : n-1] + return x +} + +func TestLargeHeap(t *testing.T) { + var L internal.SampleHeap[float64] + var S simpleHeap + + for i := 0; i < 1e6; i++ { + v := rand.NormFloat64() + L.Push(internal.Vsample[float64]{ + Sample: v, + Weight: v, + }) + heap.Push(&S, v) + } + + for len(S) > 0 { + v1 := heap.Pop(&S) + v2 := L.Pop().Weight + + require.Equal(t, v1, v2) + } + + require.Equal(t, 0, len(L)) +} diff --git a/simple/doc.go b/simple/doc.go new file mode 100644 index 0000000..0d79ebe --- /dev/null +++ b/simple/doc.go @@ -0,0 +1,4 @@ +// Copyright 2019, LightStep Inc. + +/* Package simple implements an unweighted reservoir sampling algorithm. */ +package simple diff --git a/simple/simple.go b/simple/simple.go new file mode 100644 index 0000000..f36c622 --- /dev/null +++ b/simple/simple.go @@ -0,0 +1,66 @@ +// Copyright 2019, LightStep Inc. + +package simple + +import ( + "math/rand" +) + +// Simple implements unweighted reservoir sampling using Algorithm R +// from "Random sampling with a reservoir" by Jeffrey Vitter (1985) +// https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R +type Simple[T any] struct { + capacity int + observed int + buffer []T + rnd *rand.Rand +} + +// New returns a simple reservoir sampler with given capacity +// (i.e., reservoir size) and random number generator. +func New[T any](capacity int, rnd *rand.Rand) *Simple[T] { + s := &Simple[T]{} + s.Init(capacity, rnd) + return s +} + +func (s *Simple[T]) Init(capacity int, rnd *rand.Rand) { + *s = Simple[T]{ + capacity: capacity, + buffer: make([]T, 0, s.capacity), + rnd: rnd, + } +} + +// Add considers a new observation for the sample. Items have unit +// weight. +func (s *Simple[T]) Add(item T) { + s.observed++ + + if len(s.buffer) < s.capacity { + s.buffer = append(s.buffer, item) + return + } + + // Give this a capacity/observed chance of replacing an existing entry. + index := s.rnd.Intn(s.observed) + if index < s.capacity { + s.buffer[index] = item + } +} + +// Get returns the i'th selected item from the sample. +func (s *Simple[T]) Get(i int) T { + return s.buffer[i] +} + +// Size returns the number of items in the sample. If the reservoir is +// full, Size() equals Capacity(). +func (s *Simple[T]) Size() int { + return len(s.buffer) +} + +// Count returns the number of items that were observed. +func (s *Simple[T]) Count() int { + return s.observed +} diff --git a/simple/simple_test.go b/simple/simple_test.go new file mode 100644 index 0000000..fe8cf01 --- /dev/null +++ b/simple/simple_test.go @@ -0,0 +1,39 @@ +// Copyright 2019, LightStep Inc. + +package simple_test + +import ( + "math/rand" + "testing" + + "github.com/lightstep/varopt/simple" + "github.com/stretchr/testify/require" +) + +func TestSimple(t *testing.T) { + const ( + popSize = 1e6 + sampleProb = 0.1 + sampleSize int = popSize * sampleProb + epsilon = 0.01 + ) + + rnd := rand.New(rand.NewSource(17167)) + + ss := simple.New[int](sampleSize, rnd) + + psum := 0. + for i := 0; i < popSize; i++ { + ss.Add(i) + psum += float64(i) + } + + require.Equal(t, ss.Size(), sampleSize) + + ssum := 0.0 + for i := 0; i < sampleSize; i++ { + ssum += float64(ss.Get(i)) + } + + require.InEpsilon(t, ssum/float64(ss.Size()), psum/popSize, epsilon) +} diff --git a/varopt.go b/varopt.go index d3cd2f1..e1fbd2a 100644 --- a/varopt.go +++ b/varopt.go @@ -1,25 +1,32 @@ -// Stream sampling for variance-optimal estimation of subset sums -// Edith Cohen, Nick Duffield, Haim Kaplan, Carsten Lund, Mikkel Thorup -// 2008 -// https://arxiv.org/pdf/0803.0473.pdf +// Copyright 2019, LightStep Inc. package varopt import ( - "container/heap" "fmt" + "math" "math/rand" + + "github.com/lightstep/varopt/internal" ) -type Varopt struct { - // Large-weight items - L largeHeap +// Varopt implements the algorithm from Stream sampling for +// variance-optimal estimation of subset sums Edith Cohen, Nick +// Duffield, Haim Kaplan, Carsten Lund, Mikkel Thorup 2008 +// +// https://arxiv.org/pdf/0803.0473.pdf +type Varopt[T any] struct { + // Random number generator + rnd *rand.Rand + + // Large-weight items stored in a min-heap. + L internal.SampleHeap[T] // Light-weight items. - T []vsample + T []internal.Vsample[T] // Temporary buffer. - X []vsample + X []internal.Vsample[T] // Current threshold tau float64 @@ -31,40 +38,76 @@ type Varopt struct { totalWeight float64 } -type vsample struct { - sample Sample - weight float64 -} +var ErrInvalidWeight = fmt.Errorf("Negative, Zero, Inf or NaN weight") -type largeHeap []vsample - -func NewVaropt(capacity int) *Varopt { - v := InitVaropt(capacity) - return &v +// New returns a new Varopt sampler with given capacity (i.e., +// reservoir size) and random number generator. +func New[T any](capacity int, rnd *rand.Rand) *Varopt[T] { + v := &Varopt[T]{} + v.Init(capacity, rnd) + return v } -func InitVaropt(capacity int) Varopt { - return Varopt{ +// Init initializes a Varopt[T] in-place, avoiding an allocation +// compared with New(). +func (v *Varopt[T]) Init(capacity int, rnd *rand.Rand) { + *v = Varopt[T]{ capacity: capacity, + rnd: rnd, + L: make(internal.SampleHeap[T], 0, capacity), + T: make(internal.SampleHeap[T], 0, capacity), } } -func (s *Varopt) Add(sample Sample, weight float64) { - individual := vsample{ - sample: sample, - weight: weight, +// Reset returns the sampler to its initial state, maintaining its +// capacity and random number source. +func (s *Varopt[T]) Reset() { + s.L = s.L[:0] + s.T = s.T[:0] + s.X = s.X[:0] + s.tau = 0 + s.totalCount = 0 + s.totalWeight = 0 +} + +// CopyFrom copies the fields of `from` into this Varopt[T]. +func (s *Varopt[T]) CopyFrom(from *Varopt[T]) { + // Copy non-slice fields + cpy := *from + // Keep existing slices, reset + cpy.L = s.L[:0] + cpy.T = s.T[:0] + cpy.X = s.X[:0] + // Append to existing slices + cpy.L = append(cpy.L, from.L...) + cpy.T = append(cpy.T, from.T...) + cpy.X = append(cpy.X, from.X...) + // Assign back to `s` + *s = cpy +} + +// Add considers a new observation for the sample with given weight. +// If there is an item ejected from the sample as a result, the item +// is returned to allow re-use of memory. +// +// An error will be returned if the weight is either negative or NaN. +func (s *Varopt[T]) Add(item T, weight float64) (T, error) { + var zero T + individual := internal.Vsample[T]{ + Sample: item, + Weight: weight, } - if weight <= 0 { - panic(fmt.Sprint("Invalid weight <= 0: ", weight)) + if weight <= 0 || math.IsNaN(weight) || math.IsInf(weight, 1) { + return zero, ErrInvalidWeight } s.totalCount++ s.totalWeight += weight if s.Size() < s.capacity { - heap.Push(&s.L, individual) - return + s.L.Push(individual) + return zero, nil } // the X <- {} step from the paper is not done here, @@ -73,16 +116,16 @@ func (s *Varopt) Add(sample Sample, weight float64) { W := s.tau * float64(len(s.T)) if weight > s.tau { - heap.Push(&s.L, individual) + s.L.Push(individual) } else { s.X = append(s.X, individual) W += weight } - for len(s.L) > 0 && W >= float64(len(s.T)+len(s.X)-1)*s.L[0].weight { - h := heap.Pop(&s.L).(vsample) + for len(s.L) > 0 && W >= float64(len(s.T)+len(s.X)-1)*s.L[0].Weight { + h := s.L.Pop() s.X = append(s.X, h) - W += h.weight + W += h.Weight } s.tau = W / float64(len(s.T)+len(s.X)-1) @@ -90,27 +133,31 @@ func (s *Varopt) Add(sample Sample, weight float64) { d := 0 for d < len(s.X) && r >= 0 { - wxd := s.X[d].weight + wxd := s.X[d].Weight r -= (1 - wxd/s.tau) d++ } + var eject T if r < 0 { if d < len(s.X) { s.X[d], s.X[len(s.X)-1] = s.X[len(s.X)-1], s.X[d] } + eject = s.X[len(s.X)-1].Sample s.X = s.X[:len(s.X)-1] } else { - ti := rand.Intn(len(s.T)) + ti := s.rnd.Intn(len(s.T)) s.T[ti], s.T[len(s.T)-1] = s.T[len(s.T)-1], s.T[ti] + eject = s.T[len(s.T)-1].Sample s.T = s.T[:len(s.T)-1] } s.T = append(s.T, s.X...) s.X = s.X[:0] + return eject, nil } -func (s *Varopt) uniform() float64 { +func (s *Varopt[T]) uniform() float64 { for { - r := rand.Float64() + r := s.rnd.Float64() if r != 0.0 { return r } @@ -120,62 +167,50 @@ func (s *Varopt) uniform() float64 { // Get() returns the i'th sample and its adjusted weight. To obtain // the sample's original weight (i.e. what was passed to Add), use // GetOriginalWeight(i). -func (s *Varopt) Get(i int) (Sample, float64) { +func (s *Varopt[T]) Get(i int) (T, float64) { if i < len(s.L) { - return s.L[i].sample, s.L[i].weight + return s.L[i].Sample, s.L[i].Weight } - return s.T[i-len(s.L)].sample, s.tau + return s.T[i-len(s.L)].Sample, s.tau } -func (s *Varopt) GetOriginalWeight(i int) float64 { +// GetOriginalWeight returns the original input weight of the sample +// item that was passed to Add(). This can be useful for computing a +// frequency from the adjusted sample weight. +func (s *Varopt[T]) GetOriginalWeight(i int) float64 { if i < len(s.L) { - return s.L[i].weight + return s.L[i].Weight } - return s.T[i-len(s.L)].weight + return s.T[i-len(s.L)].Weight } -func (s *Varopt) Capacity() int { +// Capacity returns the size of the reservoir. This is the maximum +// size of the sample. +func (s *Varopt[T]) Capacity() int { return s.capacity } -func (s *Varopt) Size() int { +// Size returns the current number of items in the sample. If the +// reservoir is full, this returns Capacity(). +func (s *Varopt[T]) Size() int { return len(s.L) + len(s.T) } -func (s *Varopt) TotalWeight() float64 { +// TotalWeight returns the sum of weights that were passed to Add(). +func (s *Varopt[T]) TotalWeight() float64 { return s.totalWeight } -func (s *Varopt) TotalCount() int { +// TotalCount returns the number of calls to Add(). +func (s *Varopt[T]) TotalCount() int { return s.totalCount } -func (s *Varopt) Tau() float64 { +// Tau returns the current large-weight threshold. Weights larger +// than Tau() carry their exact weight in the sample. See the VarOpt +// paper for details. +func (s *Varopt[T]) Tau() float64 { return s.tau } - -func (b largeHeap) Len() int { - return len(b) -} - -func (b largeHeap) Swap(i, j int) { - b[i], b[j] = b[j], b[i] -} - -func (b largeHeap) Less(i, j int) bool { - return b[i].weight < b[j].weight -} - -func (b *largeHeap) Push(x interface{}) { - *b = append(*b, x.(vsample)) -} - -func (b *largeHeap) Pop() interface{} { - old := *b - n := len(old) - x := old[n-1] - *b = old[0 : n-1] - return x -} diff --git a/varopt_test.go b/varopt_test.go index 4a19b80..867b7f7 100644 --- a/varopt_test.go +++ b/varopt_test.go @@ -1,9 +1,14 @@ +// Copyright 2019, LightStep Inc. + package varopt_test import ( "math" + "math/rand" "testing" + "github.com/lightstep/varopt" + "github.com/lightstep/varopt/simple" "github.com/stretchr/testify/require" ) @@ -11,20 +16,19 @@ import ( // There are odd and even numbers, in equal amount // There are last-digits 0-9 in equal amount // -// Much like simple_test.go, we will test the mean is correct and, -// because unbiased, also the odd/even and last-digit-0-9 groupings -// will be balanced. +// We will test the mean is correct and, because unbiased, also the +// odd/even and last-digit-0-9 groupings will be balanced. const ( numBlocks = 100 popSize = 1e7 sampleProb = 0.001 sampleSize int = popSize * sampleProb - // TODO epsilon is somewhat variable b/c we're using the - // static rand w/o a fixed seed for the test. - epsilon = 0.06 + epsilon = 0.08 ) +type testInt int + func TestUnbiased(t *testing.T) { var ( // Ratio of big blocks to small blocks @@ -53,12 +57,12 @@ func testUnbiased(t *testing.T, bbr, bsr float64) { extra = popSize - bigSize*numBig - smallSize*numSmall ) - population := make([]Sample, popSize) + population := make([]testInt, popSize) psum := 0.0 for i := range population { - population[i] = i + population[i] = testInt(i) psum += float64(i) } @@ -67,17 +71,17 @@ func testUnbiased(t *testing.T, bbr, bsr float64) { // population[i], population[j] = population[j], population[i] // }) - smallBlocks := make([][]Sample, numSmall) - bigBlocks := make([][]Sample, numBig) + smallBlocks := make([][]testInt, numSmall) + bigBlocks := make([][]testInt, numBig) for i := 0; i < numSmall; i++ { - smallBlocks[i] = make([]Sample, smallSize) + smallBlocks[i] = make([]testInt, smallSize) } for i := 0; i < numBig; i++ { if i == 0 { - bigBlocks[0] = make([]Sample, bigSize+extra) + bigBlocks[0] = make([]testInt, bigSize+extra) } else { - bigBlocks[i] = make([]Sample, bigSize) + bigBlocks[i] = make([]testInt, bigSize) } } @@ -97,22 +101,23 @@ func testUnbiased(t *testing.T, bbr, bsr float64) { require.Equal(t, len(population), pos) maxDiff := 0.0 + rnd := rand.New(rand.NewSource(98887)) - func(allBlockLists ...[][][]Sample) { + func(allBlockLists ...[][][]testInt) { for _, blockLists := range allBlockLists { - varopt := NewVaropt(sampleSize) + vsample := varopt.New[testInt](sampleSize, rnd) for _, blockList := range blockLists { for _, block := range blockList { - simple := NewSimple(sampleSize) + ss := simple.New[testInt](sampleSize, rnd) for _, s := range block { - simple.Add(s) + ss.Add(s) } - weight := simple.Weight() - for i := 0; i < simple.Size(); i++ { - varopt.Add(simple.Get(i), weight) + weight := float64(ss.Count()) / float64(ss.Size()) + for i := 0; i < ss.Size(); i++ { + vsample.Add(ss.Get(i), weight) } } } @@ -121,9 +126,9 @@ func testUnbiased(t *testing.T, bbr, bsr float64) { odd := 0.0 even := 0.0 - for i := 0; i < varopt.Size(); i++ { - v, w := varopt.Get(i) - vi := v.(int) + for i := 0; i < vsample.Size(); i++ { + v, w := vsample.Get(i) + vi := int(v) if vi%2 == 0 { even++ } else { @@ -140,7 +145,121 @@ func testUnbiased(t *testing.T, bbr, bsr float64) { require.InEpsilon(t, odd, even, epsilon) } }( - [][][]Sample{bigBlocks, smallBlocks}, - [][][]Sample{smallBlocks, bigBlocks}, + [][][]testInt{bigBlocks, smallBlocks}, + [][][]testInt{smallBlocks, bigBlocks}, ) } + +func TestInvalidWeight(t *testing.T) { + rnd := rand.New(rand.NewSource(98887)) + v := varopt.New[testInt](1, rnd) + + _, err := v.Add(1, math.NaN()) + require.Equal(t, err, varopt.ErrInvalidWeight) + + _, err = v.Add(1, -1) + require.Equal(t, err, varopt.ErrInvalidWeight) + + _, err = v.Add(1, 0) + require.Equal(t, err, varopt.ErrInvalidWeight) +} + +func TestReset(t *testing.T) { + const capacity = 10 + const insert = 100 + rnd := rand.New(rand.NewSource(98887)) + v := varopt.New[testInt](capacity, rnd) + + sum := 0. + for i := 1.; i <= insert; i++ { + v.Add(testInt(i), i) + sum += i + } + + require.Equal(t, capacity, v.Size()) + require.Equal(t, insert, v.TotalCount()) + require.Equal(t, sum, v.TotalWeight()) + require.Less(t, 0., v.Tau()) + + var v2 varopt.Varopt[testInt] + v2.Init(capacity, rnd) + v2.CopyFrom(v) + + var expect []testInt + for i := 0; i < v.Size(); i++ { + got, _ := v.Get(i) + expect = append(expect, got) + } + + v.Reset() + + require.Equal(t, 0, v.Size()) + require.Equal(t, 0, v.TotalCount()) + require.Equal(t, 0., v.TotalWeight()) + require.Equal(t, 0., v.Tau()) + + require.Equal(t, capacity, v2.Size()) + require.Equal(t, insert, v2.TotalCount()) + require.Equal(t, sum, v2.TotalWeight()) + require.Less(t, 0., v2.Tau()) + + var have []testInt + for i := 0; i < v2.Size(); i++ { + got, _ := v2.Get(i) + have = append(have, got) + } + require.Equal(t, expect, have) + +} + +func TestEject(t *testing.T) { + const capacity = 100 + const rounds = 10000 + const maxvalue = 10000 + + entries := make([]testInt, capacity+1) + freelist := make([]*testInt, capacity+1) + + for i := range entries { + freelist[i] = &entries[i] + } + + // Make two deterministically equal samplers + rnd1 := rand.New(rand.NewSource(98887)) + rnd2 := rand.New(rand.NewSource(98887)) + vsrc := rand.New(rand.NewSource(98887)) + + expected := varopt.New[*testInt](capacity, rnd1) + ejector := varopt.New[*testInt](capacity, rnd2) + + for i := 0; i < rounds; i++ { + value := testInt(vsrc.Intn(maxvalue)) + weight := vsrc.ExpFloat64() + + _, _ = expected.Add(&value, weight) + + lastitem := len(freelist) - 1 + item := freelist[lastitem] + freelist = freelist[:lastitem] + + *item = value + eject, _ := ejector.Add(item, weight) + + if eject != nil { + freelist = append(freelist, eject) + } + } + + require.Equal(t, expected.Size(), ejector.Size()) + require.Equal(t, expected.TotalCount(), ejector.TotalCount()) + require.Equal(t, expected.TotalWeight(), ejector.TotalWeight()) + require.Equal(t, expected.Tau(), ejector.Tau()) + + for i := 0; i < capacity; i++ { + expectItem, expectWeight := expected.Get(i) + ejectItem, ejectWeight := expected.Get(i) + + require.Equal(t, *expectItem, *ejectItem) + require.Equal(t, expectWeight, ejectWeight) + } +} diff --git a/weighted_test.go b/weighted_test.go new file mode 100644 index 0000000..1518723 --- /dev/null +++ b/weighted_test.go @@ -0,0 +1,81 @@ +// Copyright 2019, LightStep Inc. + +package varopt_test + +import ( + "fmt" + "math" + "math/rand" + + "github.com/lightstep/varopt" +) + +type packet struct { + size int + color string + protocol string +} + +func ExampleNew() { + const totalPackets = 1e6 + const sampleRatio = 0.01 + + colors := []string{"red", "green", "blue"} + protocols := []string{"http", "tcp", "udp"} + + sizeByColor := map[string]int{} + sizeByProtocol := map[string]int{} + trueTotalWeight := 0.0 + + rnd := rand.New(rand.NewSource(32491)) + sampler := varopt.New[packet](totalPackets*sampleRatio, rnd) + + for i := 0; i < totalPackets; i++ { + packet := packet{ + size: 1 + rnd.Intn(100000), + color: colors[rnd.Intn(len(colors))], + protocol: protocols[rnd.Intn(len(protocols))], + } + + sizeByColor[packet.color] += packet.size + sizeByProtocol[packet.protocol] += packet.size + trueTotalWeight += float64(packet.size) + + sampler.Add(packet, float64(packet.size)) + } + + estSizeByColor := map[string]float64{} + estSizeByProtocol := map[string]float64{} + estTotalWeight := 0.0 + + for i := 0; i < sampler.Size(); i++ { + packet, weight := sampler.Get(i) + estSizeByColor[packet.color] += weight + estSizeByProtocol[packet.protocol] += weight + estTotalWeight += weight + } + + // Compute mean average percentage error for colors + colorMape := 0.0 + for _, c := range colors { + colorMape += math.Abs(float64(sizeByColor[c])-estSizeByColor[c]) / float64(sizeByColor[c]) + } + colorMape /= float64(len(colors)) + + // Compute mean average percentage error for protocols + protocolMape := 0.0 + for _, p := range protocols { + protocolMape += math.Abs(float64(sizeByProtocol[p])-estSizeByProtocol[p]) / float64(sizeByProtocol[p]) + } + protocolMape /= float64(len(protocols)) + + // Compute total sum error percentage + fmt.Printf("Total sum error %.2g%%\n", 100*math.Abs(estTotalWeight-trueTotalWeight)/trueTotalWeight) + fmt.Printf("Color mean absolute percentage error %.2f%%\n", 100*colorMape) + fmt.Printf("Protocol mean absolute percentage error %.2f%%\n", 100*protocolMape) + + // Output: + // Total sum error 2.4e-11% + // Color mean absolute percentage error 0.73% + // Protocol mean absolute percentage error 1.62% +}