eval

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 14, 2026 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AppendHistory

func AppendHistory(path string, report Report) error

AppendHistory appends one JSONL record per model in report.

func BuildPrompt

func BuildPrompt(source string) string

BuildPrompt builds a strict JSON extraction prompt.

func RenderLeaderboardMarkdown

func RenderLeaderboardMarkdown(rows []LeaderboardRow) string

RenderLeaderboardMarkdown renders leaderboard as a non-table list to avoid truncation in narrow terminals/renderers.

func RenderMarkdown

func RenderMarkdown(r Report) string

RenderMarkdown returns a concise markdown report.

func SortByRecallDesc

func SortByRecallDesc(models []ModelResult)

SortByRecallDesc sorts models from best recall to worst.

Types

type Case

type Case struct {
	ID           string
	Title        string
	Source       string
	GoldInsights []string
}

Case is a single evaluation sample.

func LoadDataset

func LoadDataset(dir string) ([]Case, error)

LoadDataset loads source_*.txt and gold_*.json pairs from a directory.

type CaseResult

type CaseResult struct {
	CaseID    string       `json:"case_id"`
	RawOutput string       `json:"raw_output"`
	Parsed    ParsedOutput `json:"parsed"`
	Score     Score        `json:"score"`
	TTFMS     int64        `json:"ttf_ms,omitempty"`
	Error     string       `json:"error,omitempty"`
}

CaseResult is one model response + score for one case.

type LeaderboardRow

type LeaderboardRow struct {
	ModelID             string
	Runs                int
	AverageRecall       float64
	BestRecall          float64
	AverageCoverage     float64
	TotalContradictions int
	OverallPassRate     float64
	LastSeen            time.Time
}

LeaderboardRow aggregates scores across run history.

func BuildLeaderboard

func BuildLeaderboard(records []RunRecord) []LeaderboardRow

BuildLeaderboard builds per-model aggregates sorted by performance.

type ModelResult

type ModelResult struct {
	ModelID          string       `json:"model_id"`
	Cases            []CaseResult `json:"cases"`
	Summary          ModelSummary `json:"summary"`
	ElapsedMS        int64        `json:"elapsed_ms"`
	CompletionTokens int          `json:"completion_tokens"`
	TokensPerSec     float64      `json:"tokens_per_sec"`
	AvgTTFMS         int64        `json:"avg_ttf_ms"`
}

ModelResult includes all cases for one model.

type ModelSummary

type ModelSummary struct {
	AverageRecall       float64 `json:"average_recall"`
	AverageCoverage     float64 `json:"average_quote_coverage"`
	TotalContradictions int     `json:"total_contradictions"`
	FormatPassRate      float64 `json:"format_pass_rate"`
	OverallPass         bool    `json:"overall_pass"`
}

ModelSummary aggregates case-level scores for a model.

func BuildModelSummary

func BuildModelSummary(cases []CaseResult, recallThreshold float64) ModelSummary

BuildModelSummary aggregates results from a model's cases.

type ParsedOutput

type ParsedOutput struct {
	TLDR           string   `json:"tldr"`
	KeyInsights    []string `json:"key_insights"`
	EvidenceQuotes []string `json:"evidence_quotes"`
}

ParsedOutput is the normalized model output for scoring.

func ParseOutput

func ParseOutput(raw string) (ParsedOutput, error)

ParseOutput parses model output into ParsedOutput.

type Report

type Report struct {
	GeneratedAt     time.Time     `json:"generated_at"`
	DatasetPath     string        `json:"dataset_path"`
	RecallThreshold float64       `json:"recall_threshold"`
	Models          []ModelResult `json:"models"`
}

Report is the top-level evaluation artifact.

type RunRecord

type RunRecord struct {
	GeneratedAt     time.Time `json:"generated_at"`
	DatasetPath     string    `json:"dataset_path"`
	RecallThreshold float64   `json:"recall_threshold"`
	ModelID         string    `json:"model_id"`
	CaseCount       int       `json:"case_count"`
	AverageRecall   float64   `json:"average_recall"`
	AverageCoverage float64   `json:"average_quote_coverage"`
	Contradictions  int       `json:"total_contradictions"`
	FormatPassRate  float64   `json:"format_pass_rate"`
	OverallPass     bool      `json:"overall_pass"`
}

RunRecord stores one model result from one eval run.

func FilterHistory

func FilterHistory(records []RunRecord, datasetPath string, recallThreshold float64) []RunRecord

FilterHistory keeps records matching dataset and recall threshold.

func LoadHistory

func LoadHistory(path string) ([]RunRecord, error)

LoadHistory reads JSONL run records.

type Score

type Score struct {
	Recall           float64 `json:"recall"`
	MissingInsights  int     `json:"missing_insights"`
	Contradictions   int     `json:"contradictions"`
	QuoteCoverage    float64 `json:"quote_coverage"`
	FormatCompliant  bool    `json:"format_compliant"`
	Pass             bool    `json:"pass"`
	MatchedGoldCount int     `json:"matched_gold_count"`
}

Score contains scoring metrics for one case.

func ScoreCase

func ScoreCase(c Case, out ParsedOutput, recallThreshold float64) Score

ScoreCase computes simple extraction quality metrics.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL