eval

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 14, 2026 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AppendHistory

func AppendHistory(path string, report Report) error

AppendHistory appends one JSONL record per model in report.

func BuildPrompt

func BuildPrompt(source string) string

BuildPrompt builds a strict JSON extraction prompt.

func RenderLeaderboardMarkdown

func RenderLeaderboardMarkdown(rows []LeaderboardRow) string

RenderLeaderboardMarkdown renders leaderboard as a non-table list to avoid truncation in narrow terminals/renderers.

func RenderMarkdown

func RenderMarkdown(r Report) string

RenderMarkdown returns a concise markdown report.

func SortByRecallDesc

func SortByRecallDesc(models []ModelResult)

SortByRecallDesc sorts models from best recall to worst.

Types

type Case

type Case struct {
	ID           string
	Title        string
	Source       string
	GoldInsights []string
}

Case is a single evaluation sample.

func LoadDataset

func LoadDataset(dir string) ([]Case, error)

LoadDataset loads source_*.txt and gold_*.json pairs from a directory.

type CaseResult

type CaseResult struct {
	CaseID    string       `json:"case_id"`
	RawOutput string       `json:"raw_output"`
	Parsed    ParsedOutput `json:"parsed"`
	Score     Score        `json:"score"`
	TTFMS     int64        `json:"ttf_ms,omitempty"`
	Error     string       `json:"error,omitempty"`
}

CaseResult is one model response + score for one case.

type LeaderboardRow

type LeaderboardRow struct {
	ModelID             string
	Runs                int
	AverageRecall       float64
	BestRecall          float64
	AverageCoverage     float64
	TotalContradictions int
	OverallPassRate     float64
	LastSeen            time.Time
}

LeaderboardRow aggregates scores across run history.

func BuildLeaderboard

func BuildLeaderboard(records []RunRecord) []LeaderboardRow

BuildLeaderboard builds per-model aggregates sorted by performance.

type ModelResult

type ModelResult struct {
	ModelID          string       `json:"model_id"`
	Cases            []CaseResult `json:"cases"`
	Summary          ModelSummary `json:"summary"`
	ElapsedMS        int64        `json:"elapsed_ms"`
	CompletionTokens int          `json:"completion_tokens"`
	TokensPerSec     float64      `json:"tokens_per_sec"`
	AvgTTFMS         int64        `json:"avg_ttf_ms"`
}

ModelResult includes all cases for one model.

type ModelSummary

type ModelSummary struct {
	AverageRecall       float64 `json:"average_recall"`
	AverageCoverage     float64 `json:"average_quote_coverage"`
	TotalContradictions int     `json:"total_contradictions"`
	FormatPassRate      float64 `json:"format_pass_rate"`
	OverallPass         bool    `json:"overall_pass"`
}

ModelSummary aggregates case-level scores for a model.

func BuildModelSummary

func BuildModelSummary(cases []CaseResult, recallThreshold float64) ModelSummary

BuildModelSummary aggregates results from a model's cases.

type ParsedOutput

type ParsedOutput struct {
	TLDR           string   `json:"tldr"`
	KeyInsights    []string `json:"key_insights"`
	EvidenceQuotes []string `json:"evidence_quotes"`
}

ParsedOutput is the normalized model output for scoring.

func ParseOutput

func ParseOutput(raw string) (ParsedOutput, error)

ParseOutput parses model output into ParsedOutput.

type Report

type Report struct {
	GeneratedAt     time.Time     `json:"generated_at"`
	DatasetPath     string        `json:"dataset_path"`
	RecallThreshold float64       `json:"recall_threshold"`
	Models          []ModelResult `json:"models"`
}

Report is the top-level evaluation artifact.

type RunRecord

type RunRecord struct {
	GeneratedAt     time.Time `json:"generated_at"`
	DatasetPath     string    `json:"dataset_path"`
	RecallThreshold float64   `json:"recall_threshold"`
	ModelID         string    `json:"model_id"`
	CaseCount       int       `json:"case_count"`
	AverageRecall   float64   `json:"average_recall"`
	AverageCoverage float64   `json:"average_quote_coverage"`
	Contradictions  int       `json:"total_contradictions"`
	FormatPassRate  float64   `json:"format_pass_rate"`
	OverallPass     bool      `json:"overall_pass"`
}

RunRecord stores one model result from one eval run.

func FilterHistory

func FilterHistory(records []RunRecord, datasetPath string, recallThreshold float64) []RunRecord

FilterHistory keeps records matching dataset and recall threshold.

func LoadHistory

func LoadHistory(path string) ([]RunRecord, error)

LoadHistory reads JSONL run records.

type Score

type Score struct {
	Recall           float64 `json:"recall"`
	MissingInsights  int     `json:"missing_insights"`
	Contradictions   int     `json:"contradictions"`
	QuoteCoverage    float64 `json:"quote_coverage"`
	FormatCompliant  bool    `json:"format_compliant"`
	Pass             bool    `json:"pass"`
	MatchedGoldCount int     `json:"matched_gold_count"`
}

Score contains scoring metrics for one case.

func ScoreCase

func ScoreCase(c Case, out ParsedOutput, recallThreshold float64) Score

ScoreCase computes simple extraction quality metrics.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL