Skip to content

Commit

Permalink
Merge pull request #7 from natexcvi/SP-25235-evaluation
Browse files Browse the repository at this point in the history
Implement basic evaluation library
  • Loading branch information
orihoogi authored Aug 16, 2023
2 parents 3c4fc4f + bfab09f commit f92afae
Show file tree
Hide file tree
Showing 8 changed files with 490 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
.vscode
**/.venv
output.json
log.txt
log.txt
.idea
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,60 @@ Agents are the main component of the library. Agents can perform complex tasks t

### Prebuilt (WIP)
A collection of ready-made agents that can be easily integrated with your application.

### Evaluation (WIP)
A collection of evaluation tools for agents and engines.

## Example
```go
package main

import (
"fmt"
"os"

"github.com/natexcvi/go-llm/engines"
"github.com/natexcvi/go-llm/evaluation"
)

// goodness scores a single run: any runner error counts as a total
// failure (0); otherwise the output is considered perfect (1).
func goodness(_ *engines.ChatPrompt, _ *engines.ChatMessage, err error) float64 {
	if err == nil {
		return 1
	}
	return 0
}

func main() {
	// Wrap a GPT engine in a Runner so the evaluator can drive it.
	engine := engines.NewGPTEngine(os.Getenv("OPENAI_TOKEN"), "gpt-3.5-turbo-0613")
	runner := evaluation.NewLLMRunner(engine)

	// Score each output with `goodness`, averaging over 5 repetitions.
	evaluator := evaluation.NewEvaluator(runner, &evaluation.Options[*engines.ChatPrompt, *engines.ChatMessage]{
		GoodnessFunction: goodness,
		Repetitions:      5,
	})

	// Two sample prompts to evaluate.
	prompts := []*engines.ChatPrompt{
		{
			History: []*engines.ChatMessage{
				{
					Text: "Hello, how are you?",
				},
				{
					Text: "I'm trying to understand how this works.",
				},
			},
		},
		{
			History: []*engines.ChatMessage{
				{
					Text: "Could you please explain it to me?",
				},
			},
		},
	}

	scores := evaluator.Evaluate(prompts)
	fmt.Println("Goodness level of the first prompt:", scores[0])
	fmt.Println("Goodness level of the second prompt:", scores[1])
}
```
20 changes: 20 additions & 0 deletions evaluation/agent_evaluator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package evaluation

import (
"github.com/natexcvi/go-llm/agents"
)

// agentRunner adapts an agents.Agent to the Runner interface so that an
// agent's outputs can be scored by an Evaluator.
type agentRunner[Input, Output any] struct {
	agent agents.Agent[Input, Output]
}

// NewAgentRunner returns a new agent runner that can be used to evaluate
// the output of the given agent.
func NewAgentRunner[Input, Output any](agent agents.Agent[Input, Output]) Runner[Input, Output] {
	return &agentRunner[Input, Output]{
		agent: agent,
	}
}

// Run delegates to the wrapped agent, returning its output and error.
func (t *agentRunner[Input, Output]) Run(input Input) (Output, error) {
	return t.agent.Run(input)
}
104 changes: 104 additions & 0 deletions evaluation/evaluator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package evaluation

import (
	"fmt"

	"github.com/samber/mo"
)

// GoodnessFunction is a function that takes an input, an output and an error
// (if one occurred) and returns a float64 which represents the goodness score
// of the output.
type GoodnessFunction[Input, Output any] func(input Input, output Output, err error) float64

// Options is a struct that contains the options for the evaluator.
type Options[Input, Output any] struct {
	// GoodnessFunction is the goodness function that will be used to
	// evaluate each output.
	GoodnessFunction GoodnessFunction[Input, Output]
	// Repetitions is the number of times the test will be repeated. The
	// goodness level of each output will be averaged across repetitions.
	// NOTE(review): callers appear expected to set this to a positive
	// value — confirm how non-positive values should be handled.
	Repetitions int
}

// Runner is an interface that represents a test runner that will be used to
// evaluate the output. It takes an input and returns an output and an error.
type Runner[Input, Output any] interface {
	Run(input Input) (Output, error)
}

// Evaluator is a struct that runs the tests and evaluates the outputs.
type Evaluator[Input, Output any] struct {
	// options configures scoring and the number of repetitions.
	options *Options[Input, Output]
	// runner produces an output (or error) for each input under test.
	runner Runner[Input, Output]
}

// NewEvaluator creates a new `Evaluator` with the provided runner and
// configuration options.
func NewEvaluator[Input, Output any](runner Runner[Input, Output], options *Options[Input, Output]) *Evaluator[Input, Output] {
	return &Evaluator[Input, Output]{
		options: options,
		runner:  runner,
	}
}

// Evaluate runs the tests and evaluates the outputs. The function receives a
// test pack, which is a slice of inputs, and returns a slice of float64 which
// represents the goodness level of each respective output, averaged over
// e.options.Repetitions concurrent runs. Repetitions that fail outright are
// excluded from the average; if every repetition fails, or Repetitions is
// non-positive, a zero-valued report is returned.
func (e *Evaluator[Input, Output]) Evaluate(testPack []Input) []float64 {
	report := make([]float64, len(testPack))
	// Guard against a division by zero (and a negative make size) below.
	if e.options.Repetitions <= 0 || len(testPack) == 0 {
		return report
	}

	repetitionChannels := make([]chan []float64, e.options.Repetitions)
	for i := 0; i < e.options.Repetitions; i++ {
		repetitionChannels[i] = make(chan []float64)
		go func(ch chan []float64) {
			rep, err := e.evaluate(testPack)
			if err != nil {
				// Signal failure with a nil report; the receiver skips it
				// instead of indexing into it (which would panic).
				ch <- nil
				return
			}
			ch <- rep
		}(repetitionChannels[i])
	}

	// Collect only the successful repetitions.
	responses := make([][]float64, 0, e.options.Repetitions)
	for i := 0; i < e.options.Repetitions; i++ {
		if rep := <-repetitionChannels[i]; rep != nil {
			responses = append(responses, rep)
		}
	}
	if len(responses) == 0 {
		return report
	}

	// Average each input's score across the successful repetitions.
	for i := range report {
		sum := 0.0
		for _, rep := range responses {
			sum += rep[i]
		}
		report[i] = sum / float64(len(responses))
	}

	return report
}

// evaluate performs a single repetition: it runs every input through the
// runner and scores each result with the configured goodness function.
func (e *Evaluator[Input, Output]) evaluate(testPack []Input) ([]float64, error) {
	outputs, err := e.test(testPack)
	if err != nil {
		return nil, fmt.Errorf("failed to test: %w", err)
	}

	scores := make([]float64, len(testPack))
	for i := range outputs {
		out, outErr := outputs[i].Get()
		scores[i] = e.options.GoodnessFunction(testPack[i], out, outErr)
	}

	return scores, nil
}

// test runs every input through the runner, wrapping each outcome — success
// or failure — in a mo.Result so no run aborts the batch.
func (e *Evaluator[Input, Output]) test(testPack []Input) ([]mo.Result[Output], error) {
	results := make([]mo.Result[Output], len(testPack))

	for idx, input := range testPack {
		out, runErr := e.runner.Run(input)
		if runErr == nil {
			results[idx] = mo.Ok(out)
		} else {
			results[idx] = mo.Err[Output](runErr)
		}
	}

	return results, nil
}
Loading

0 comments on commit f92afae

Please sign in to comment.