Skip to content

Commit

Permalink
Merge pull request #7 from natexcvi/SP-25235-evaluation
Browse files Browse the repository at this point in the history
Implement basic evaluation library
  • Loading branch information
orihoogi authored Aug 16, 2023
2 parents 3c4fc4f + bfab09f commit f92afae
Show file tree
Hide file tree
Showing 8 changed files with 490 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
.vscode
**/.venv
output.json
log.txt
log.txt
.idea
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,60 @@ Agents are the main component of the library. Agents can perform complex tasks t

### Prebuilt (WIP)
A collection of ready-made agents that can be easily integrated with your application.

### Evaluation (WIP)
A collection of evaluation tools for agents and engines.

## Example
```go
package main

import (
"fmt"
"os"

"github.com/natexcvi/go-llm/engines"
"github.com/natexcvi/go-llm/evaluation"
)

// goodness scores a single run: any runner error counts as a total
// failure (0); otherwise the output is considered perfect (1).
func goodness(_ *engines.ChatPrompt, _ *engines.ChatMessage, err error) float64 {
	if err == nil {
		return 1
	}
	return 0
}

func main() {
	// Wrap a GPT engine in a Runner so the evaluator can drive it.
	engine := engines.NewGPTEngine(os.Getenv("OPENAI_TOKEN"), "gpt-3.5-turbo-0613")
	runner := evaluation.NewLLMRunner(engine)

	// Score each output with `goodness`, averaging over 5 repetitions.
	evaluator := evaluation.NewEvaluator(runner, &evaluation.Options[*engines.ChatPrompt, *engines.ChatMessage]{
		GoodnessFunction: goodness,
		Repetitions:      5,
	})

	// Two sample prompts to evaluate.
	prompts := []*engines.ChatPrompt{
		{
			History: []*engines.ChatMessage{
				{
					Text: "Hello, how are you?",
				},
				{
					Text: "I'm trying to understand how this works.",
				},
			},
		},
		{
			History: []*engines.ChatMessage{
				{
					Text: "Could you please explain it to me?",
				},
			},
		},
	}

	scores := evaluator.Evaluate(prompts)
	fmt.Println("Goodness level of the first prompt:", scores[0])
	fmt.Println("Goodness level of the second prompt:", scores[1])
}
```
20 changes: 20 additions & 0 deletions evaluation/agent_evaluator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package evaluation

import (
"github.com/natexcvi/go-llm/agents"
)

// agentRunner adapts an agents.Agent to the Runner interface so that an
// agent's outputs can be scored by an Evaluator.
type agentRunner[Input, Output any] struct {
	agent agents.Agent[Input, Output]
}

// NewAgentRunner returns a new agent runner that can be used to evaluate
// the output of the given agent.
func NewAgentRunner[Input, Output any](agent agents.Agent[Input, Output]) Runner[Input, Output] {
	return &agentRunner[Input, Output]{
		agent: agent,
	}
}

// Run delegates to the wrapped agent, returning its output and error.
func (t *agentRunner[Input, Output]) Run(input Input) (Output, error) {
	return t.agent.Run(input)
}
104 changes: 104 additions & 0 deletions evaluation/evaluator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package evaluation

import (
	"fmt"

	"github.com/samber/mo"
)

// GoodnessFunction is a function that takes an input, an output and an error
// (if one occurred) and returns a float64 which represents the goodness score
// of the output.
type GoodnessFunction[Input, Output any] func(input Input, output Output, err error) float64

// Options is a struct that contains the options for the evaluator.
type Options[Input, Output any] struct {
	// GoodnessFunction is the goodness function that will be used to
	// evaluate each output.
	GoodnessFunction GoodnessFunction[Input, Output]
	// Repetitions is the number of times the test will be repeated. The
	// goodness level of each output will be averaged across repetitions.
	// NOTE(review): callers appear expected to set this to a positive
	// value — confirm how non-positive values should be handled.
	Repetitions int
}

// Runner is an interface that represents a test runner that will be used to
// evaluate the output. It takes an input and returns an output and an error.
type Runner[Input, Output any] interface {
	Run(input Input) (Output, error)
}

// Evaluator is a struct that runs the tests and evaluates the outputs.
type Evaluator[Input, Output any] struct {
	// options configures scoring and the number of repetitions.
	options *Options[Input, Output]
	// runner produces an output (or error) for each input under test.
	runner Runner[Input, Output]
}

// NewEvaluator creates a new `Evaluator` with the provided runner and
// configuration options.
func NewEvaluator[Input, Output any](runner Runner[Input, Output], options *Options[Input, Output]) *Evaluator[Input, Output] {
	return &Evaluator[Input, Output]{
		options: options,
		runner:  runner,
	}
}

// Evaluate runs the tests and evaluates the outputs. The function receives a
// test pack, which is a slice of inputs, and returns a slice of float64 which
// represents the goodness level of each respective output, averaged over
// e.options.Repetitions concurrent runs. Repetitions that fail outright are
// excluded from the average; if every repetition fails, or Repetitions is
// non-positive, a zero-valued report is returned.
func (e *Evaluator[Input, Output]) Evaluate(testPack []Input) []float64 {
	report := make([]float64, len(testPack))
	// Guard against a division by zero (and a negative make size) below.
	if e.options.Repetitions <= 0 || len(testPack) == 0 {
		return report
	}

	repetitionChannels := make([]chan []float64, e.options.Repetitions)
	for i := 0; i < e.options.Repetitions; i++ {
		repetitionChannels[i] = make(chan []float64)
		go func(ch chan []float64) {
			rep, err := e.evaluate(testPack)
			if err != nil {
				// Signal failure with a nil report; the receiver skips it
				// instead of indexing into it (which would panic).
				ch <- nil
				return
			}
			ch <- rep
		}(repetitionChannels[i])
	}

	// Collect only the successful repetitions.
	responses := make([][]float64, 0, e.options.Repetitions)
	for i := 0; i < e.options.Repetitions; i++ {
		if rep := <-repetitionChannels[i]; rep != nil {
			responses = append(responses, rep)
		}
	}
	if len(responses) == 0 {
		return report
	}

	// Average each input's score across the successful repetitions.
	for i := range report {
		sum := 0.0
		for _, rep := range responses {
			sum += rep[i]
		}
		report[i] = sum / float64(len(responses))
	}

	return report
}

// evaluate performs a single repetition: it runs every input through the
// runner and scores each result with the configured goodness function.
func (e *Evaluator[Input, Output]) evaluate(testPack []Input) ([]float64, error) {
	outputs, err := e.test(testPack)
	if err != nil {
		return nil, fmt.Errorf("failed to test: %w", err)
	}

	scores := make([]float64, len(testPack))
	for i := range outputs {
		out, outErr := outputs[i].Get()
		scores[i] = e.options.GoodnessFunction(testPack[i], out, outErr)
	}

	return scores, nil
}

// test runs every input through the runner, wrapping each outcome — success
// or failure — in a mo.Result so no run aborts the batch.
func (e *Evaluator[Input, Output]) test(testPack []Input) ([]mo.Result[Output], error) {
	results := make([]mo.Result[Output], len(testPack))

	for idx, input := range testPack {
		out, runErr := e.runner.Run(input)
		if runErr == nil {
			results[idx] = mo.Ok(out)
		} else {
			results[idx] = mo.Err[Output](runErr)
		}
	}

	return results, nil
}
Loading

0 comments on commit f92afae

Please sign in to comment.